def run_alignment(self, inputwav, outputalign, N=3):
    """
    Time-align one unit by running the external program `julius`.

    The data of the unit to time-align must have been fixed beforehand with:
        - set_phones(str)
        - set_tokens(str)

    The output of `julius` is written into `outputalign` followed by a dot
    and the output extension of this aligner, then it is read back and
    scanned. If it reports entries as "not found" by `voca_load_htkdict`,
    these entries are given to add_tiedlist(); if at least one of them was
    added, `julius` is executed a second time on the same files.

    @param inputwav (str - IN) the audio input file name, of type PCM-WAV 16000 Hz, 16 bits
    @param outputalign (str - OUT) the output file name, without its extension
    @param N (int) N value of N-grams, relevant only with an SLM (i.e. outext=walign).
        NOTE(review): N is not referenced in the body of this method;
        confirm that gen_slm_dependencies() is expected to use its own default.
    @return (str) A message: either an empty string or the list of the
        entries that were added into the tiedlist.
    @raise Exception if the output contains error lines or a failed search.

    """
    outputalign = outputalign + "." + self._outext
    basename = os.path.splitext(inputwav)[0]

    # Grammar-based dependencies for "palign", SLM-based ones otherwise.
    generate = (self.gen_grammar_dependencies
                if self._outext == "palign"
                else self.gen_slm_dependencies)
    generate(basename)

    def read_output():
        # Load all the lines that julius wrote into the output file.
        with codecs.open(outputalign, 'r', encoding) as stream:
            return stream.readlines()

    self.run_julius(inputwav, basename, outputalign)
    output = read_output()

    # Look for the entries reported as missing: they are the text enclosed
    # in the first pair of double quotes of the error line. When several
    # lines match, the last non-empty quoted text wins.
    missing = []
    for raw in output:
        if "Error: voca_load_htkdict" in raw and "not found" in raw:
            text = ToStrip(raw)
            text = text[text.find('"') + 1:]
            quoted = text[:text.find('"')]
            if quoted:
                missing = quoted.split()

    # Try to add the missing entries, and run julius again only if the
    # tiedlist was really modified.
    message = ""
    if missing:
        added = self.add_tiedlist(missing)
        if len(added) > 0:
            message = "The acoustic model was modified. The following entries were successfully added into the tiedlist: "
            message = message + " ".join(added) + "\n"
            self.run_julius(inputwav, basename, outputalign)
            output = read_output()

    # Collect the error lines of the (last) run; the error lines that
    # contain " line " are skipped.
    error_parts = []
    for raw in output:
        if raw.startswith(("Error:", "ERROR:")) and " line " not in raw:
            error_parts.append(raw)
        if "search failed" in raw:
            message = "Julius search has failed to find the transcription in the audio file of this unit."
            # The prefix goes in front of everything collected so far.
            error_parts.insert(0, "Search error. ")

    errors = "".join(error_parts)
    if errors:
        raise Exception(message + errors)

    return message
def get_phon_entry(self, entry):
    """
    Look up the phonetization of a single entry in the dictionary.

    Nothing is guessed here: an entry that the pronunciation dictionary
    does not know is answered with the unknown symbol of that dictionary.

    @param `entry` (str) The token to phonetize.
    @return A string: the mapped phonetization of `entry`, an empty string
        if the entry is empty or is one of the ignored markers, or the
        unknown symbol.

    """
    token = ToStrip(entry)

    # Italian transcription (CLIPS-Evalita 2011 campaign):
    # the surrounding angle brackets are removed.
    if token.startswith(u"<") and token.endswith(u">"):
        token = token[1:-1]

    # Nothing left to phonetize.
    if not token:
        return ""

    # Markers that must not be phonetized:
    #   - "gpd_"/"gpf_" prefixes, used in the CID transcription
    #     (Corpus of Interactional Data, http://sldr.org/sldr000720);
    #   - "ipu_" anywhere in the token, used in SPPAS IPU segmentation.
    if token.startswith((u"gpd_", u"gpf_")) or u"ipu_" in token:
        return ""

    # Pure dictionary lookup, with the entry exactly as it is now.
    pron = self._pdict.get_pron(token)
    unknown = self._pdict.unkstamp

    return self._map_phonentry(pron) if pron != unknown else unknown