def read_walign(self, filename):
    """ Read a word-level alignment file produced by the Julius CSR engine.

        @param filename (str - IN) The input file name.
        @return A tuple (None, word-alignment) where the word-alignment is a
        list of [start-time, end-time, word, score] entries.
    """
    word_labels = [""]
    word_scores = [0]
    aligned = []
    idx = -1  # current word index; -1 means "outside a forced alignment"

    with codecs.open(filename, "r", encoding) as fp:
        content = fp.readlines()

    for line in content:
        # Each line is either a new annotation or nothing interesting!
        line = ToStrip(line)
        if line.startswith("=== begin forced alignment ==="):
            idx = 0
        elif line.startswith("=== end forced alignment ==="):
            idx = -1
        elif line.startswith("wseq1:"):
            # the sequence of words
            word_labels = line[6:].split()
            if not word_labels:
                word_labels = [""]
        elif line.startswith("cmscore1:"):
            # confidence score of the pronunciation of each word
            word_scores = [float(s) for s in line[9:].split()]
            if not word_scores:
                word_scores = [0]
        elif line.startswith("[") and idx > -1:
            # a segmented unit: "[ first-frame last-frame ] score word"
            fields = ToStrip(line.replace("[", "").replace("]", "")).split(" ")
            # frame indexes are given in centiseconds
            begin = float(fields[0]) / 100.0
            end = float(fields[1]) / 100.0
            aligned.append([begin, end, word_labels[idx], word_scores[idx]])
            idx += 1

    # Adjust time values:
    # make each unit end where the following one starts.
    for i, entry in enumerate(aligned):
        following = aligned[i + 1][0] if i + 1 < len(aligned) else 0.0
        entry[1] = max(entry[1], following)

    return (None, aligned)
def read_palign(self, filename):
    """ Read a phoneme-level alignment file produced by the Julius CSR engine.

        @param filename (str - IN) The input file name.
        @return Two lists:
        - phoneme alignment: [start-time, end-time, phoneme, score] entries;
        - word alignment: [start-time, end-time, word, score] entries.
    """
    _phonalign = []
    _wordalign = []
    phonidx = -1     # phoneme index; -1 means "outside a forced alignment"
    loc_s = 0.0      # phoneme start time
    loc_e = 0.0      # phoneme end time
    phonlist = []
    wordseq = []
    wordlist = []    # index of the last phoneme of each word
    scores = [0]
    tokens = [""]

    with codecs.open(filename, "r", encoding) as fp:
        lines = fp.readlines()

    for line in lines:
        # Each line is either a new annotation or nothing interesting!
        line = ToStrip(line)
        if line.startswith("=== begin forced alignment ==="):
            phonidx = 0
        elif line.startswith("=== end forced alignment ==="):
            phonidx = -1
        elif line.startswith("phseq1:"):
            line = ToStrip(line[7:])
            # words are separated by '|' in the phoneme sequence
            wordseq = line.split("|")
            # get the index of the last phoneme of each word
            wordlist = []
            _idx = -1
            for w in wordseq:
                _idx += len(w.strip().split())
                wordlist.append(_idx)
            # the flat list of phonemes (without word segmentation)
            phonlist = ToStrip(line.replace("|", "")).split()
        elif line.startswith("cmscore1:"):
            # confidence score of the pronunciation of each token
            scores = [float(s) for s in line[9:].split()]
            if len(scores) == 0:
                scores = [0]
        elif line.startswith("sentence1:"):
            # the sequence of tokens (words)
            tokens = line[10:].split()
            if len(tokens) == 0:
                tokens = [""]
        elif line.startswith("[") and phonidx > -1:
            # a new phoneme: "[ first-frame last-frame ] score triphone"
            line = ToStrip(line.replace("[", "").replace("]", ""))
            tab = line.split(" ")
            # frame indexes are given in centiseconds
            loc_s = float(tab[0]) / 100.0
            loc_e = float(tab[1]) / 100.0
            if len(tab) > 3:
                # put the real phoneme instead of the triphone
                _phonalign.append([loc_s, loc_e, phonlist[phonidx], tab[2]])
            else:
                _phonalign.append([loc_s, loc_e, "", tab[2]])
            phonidx = phonidx + 1

    # Adjust time values and create the word alignment
    wordidx = 0       # word index
    wordloc_s = 0.0   # word start time
    loc_e = 0.0
    nextloc_s = 0.0
    for phonidx in range(len(_phonalign)):
        # Fix the end of this annotation to the begin of the next one.
        loc_e = _phonalign[phonidx][1]
        if phonidx < (len(_phonalign) - 1):
            nextloc_s = _phonalign[phonidx + 1][0]
        else:
            nextloc_s = 0.0
        if loc_e < nextloc_s:
            loc_e = nextloc_s
        _phonalign[phonidx][1] = loc_e
        # Override the segmentation score of the phone by
        # the score of the pronunciation of the word
        _phonalign[phonidx][3] = scores[wordidx]
        # Close the current word when its last phoneme is reached.
        # The bound check protects against alignment files that contain
        # segment lines but no (or a too short) "phseq1:" line, which would
        # otherwise raise NameError/IndexError here.
        if wordidx < len(wordlist) and phonidx == wordlist[wordidx]:
            _wordalign.append([wordloc_s, loc_e,
                               tokens[wordidx], scores[wordidx]])
            wordidx = wordidx + 1
            wordloc_s = loc_e

    # last word, or the only entry in case of empty interval...
    if len(wordseq) - 1 == wordidx:
        _wordalign.append([wordloc_s, loc_e,
                           tokens[wordidx - 1], scores[wordidx - 1]])

    return (_phonalign, _wordalign)