def replace(self, utt):
    """Examine tokens and perform some replacements.

    A dictionary with symbols (self.repl) contains the replacements
    to operate.

    :param utt: (list) the utterance
    :returns: A list of strings

    """
    # Specific case of float numbers: isolate the decimal separator
    # so that both parts can be processed independently.
    sent = ' '.join(utt)
    # raw strings: '\.' in a plain literal is an invalid escape
    # (DeprecationWarning in Python 3)
    sent = re.sub(u(r'([0-9])\.([0-9])'), u(r'\1 NUMBER_SEP_POINT \2'), sent)
    sent = re.sub(u(r'([0-9]),([0-9])'), u(r'\1 NUMBER_SEP \2'), sent)
    sent = sppasUnicode(sent).to_strip()
    _utt = sent.split()

    # Other generic replacements
    _result = list()
    for s in _utt:
        if self.repl.is_key(s):
            # 's.replace(s, x)' always evaluates to 'x': call directly
            s = self.repl.replace(s)
        _result.append(sppasUnicode(s).to_strip())

    return _result
def format_token(entry):
    """Standardize a token string.

    Strip CR/LF, tabs and multiple whitespace, then convert to
    lower case.

    :param entry: (str) a token
    :returns: formatted token

    """
    stripped = sppasUnicode(entry).to_strip()
    return sppasUnicode(stripped).to_lower()
def format_token(entry):
    """Return the token without CR/LF, tabs, multiple spaces, lower-cased.

    :param entry: (str) a token
    :returns: formatted token

    """
    su = sppasUnicode(entry)
    return sppasUnicode(su.to_strip()).to_lower()
def get_phon(self, entry):
    """Return the phonetization of an unknown entry.

    The entry is stripped, lower-cased and leading/trailing
    non-alphanumeric characters are removed; it is then segmented
    with a longest-matching algorithm, both from left-to-right and
    from right-to-left, and the two results are merged.

    :param entry: (str) the string to phonetize
    :returns: a string with the proposed phonetization
    :raises: Exception if the word can NOT be phonetized

    """
    _str = sppasUnicode(entry).to_strip()
    _str = sppasUnicode(_str).to_lower()
    # drop a leading/trailing punctuation mark (quote, dash, ...)
    if len(_str) > 0 and _str[-1].isalnum() is False:
        _str = _str[:-1]
    if len(_str) > 0 and _str[0].isalnum() is False:
        _str = _str[1:]
    if len(_str) == 0:
        return ""
    if len(entry) > LIMIT_SIZE:
        raise Exception("Entry is too long to be phonetized: "
                        "{:d} characters.".format(len(entry)))

    # Find all pronunciations of segments with a longest matching algo.
    # raw string required: '\s' is an invalid escape in a plain
    # literal (DeprecationWarning in Python 3)
    _tabstr = re.split(r"[-'_\s]", _str)
    pronlr = ""
    pronrl = ""
    for s in _tabstr:
        plr = self.__recurslr(s).strip()
        if len(plr) > 0:
            pronlr = pronlr + " " + plr
        prl = self.__recursrl(s).strip()
        if len(prl) > 0:
            pronrl = pronrl + " " + prl
    pronlr = pronlr.strip()
    pronrl = pronrl.strip()

    # Create the output: merge both segmentations
    pron = ""
    if len(pronlr) > 0:
        if len(pronrl) > 0:
            pron = self.dagphon.decompose(pronlr, pronrl)
        else:
            pron = self.dagphon.decompose(pronlr)
    elif len(pronrl) > 0:
        pron = self.dagphon.decompose(pronrl)

    if len(pron) > 0:
        return pron

    raise Exception("No pronunciation found for entry: " + entry)
def get_phon(self, entry):
    """Return the phonetization of an unknown entry.

    :param entry: (str) the string to phonetize
    :returns: a string with the proposed phonetization
    :raises: Exception if the word can NOT be phonetized

    """
    # normalize: strip, lower, and drop a leading/trailing
    # non-alphanumeric character
    _str = sppasUnicode(entry).to_strip()
    _str = sppasUnicode(_str).to_lower()
    if len(_str) > 0 and _str[-1].isalnum() is False:
        _str = _str[:-1]
    if len(_str) > 0 and _str[0].isalnum() is False:
        _str = _str[1:]
    if len(_str) == 0:
        return ""
    if len(entry) > LIMIT_SIZE:
        raise Exception

    # Find all pronunciations of segments with a longest matching algo.
    # raw string required: '\s' is an invalid escape in a plain
    # literal (DeprecationWarning in Python 3)
    _tabstr = re.split(r"[-'_\s]", _str)
    pronlr = ""
    pronrl = ""
    for s in _tabstr:
        plr = self.__recurslr(s)
        plr = plr.strip()
        if len(plr) > 0:
            pronlr = pronlr + " " + plr
        prl = self.__recursrl(s)
        prl = prl.strip()
        if len(prl) > 0:
            pronrl = pronrl + " " + prl
    pronlr = pronlr.strip()
    pronrl = pronrl.strip()

    # Create the output: merge the left-to-right and right-to-left
    # segmentations
    pron = ""
    if len(pronlr) > 0:
        if len(pronrl) > 0:
            pron = self.dagphon.decompose(pronlr, pronrl)
        else:
            pron = self.dagphon.decompose(pronlr)
    else:
        if len(pronrl) > 0:
            pron = self.dagphon.decompose(pronrl)

    if len(pron) > 0:
        return pron
    raise Exception
def append_activity(self, token, activity):
    """Append a new activity.

    :param token: (str) The token of the tier TokensAlign
    :param activity: (str) Name of the activity

    """
    token = sppasUnicode(token).to_strip()
    activity = sppasUnicode(activity).to_strip()
    # keep only the first activity assigned to a token
    if token not in self._activities:
        self._activities[token] = activity
def append_activity(self, token, activity):
    """Add an activity for a token if the token is not already known.

    :param token: (str) The token of the tier TokensAlign
    :param activity: (str) Name of the activity

    """
    key = sppasUnicode(token).to_strip()
    value = sppasUnicode(activity).to_strip()
    # setdefault keeps the first assigned activity only
    self._activities.setdefault(key, value)
def __stick_longest_lr(self, phrase, separator):
    """Return the longest first word of a phrase.

    A longest matching algorithm is applied from left to right.

    :param phrase: (str)
    :returns: tuple of (index of the first longest token, the longest token)

    """
    tokens = phrase.split(" ")
    word = tokens[0]
    # without a vocabulary, keep only the very first token
    if self.__vocab is None:
        return 1, word

    index = len(tokens)
    while index > 0:
        # try to aggregate all remaining tokens
        word = separator.join(tokens)
        # next round will try without the last token
        tokens.pop()
        index -= 1
        # stop as soon as the aggregation is a known word
        if self.__vocab.is_unk(word) is False:
            break

    # if nothing matched, the first given token is returned
    return index, sppasUnicode(word).to_strip()
def add_pron(self, token, pron):
    """Add a token/pron to the dict.

    :param token: (str) Unicode string of the token to add
    :param pron: (str) A pronunciation in which the phonemes are
    separated by whitespace

    """
    entry = sppasDictPron.format_token(token)
    new_pron = sppasUnicode(pron).to_strip().replace(
        " ", separators.phonemes)

    # Merge with an already known pronunciation of this token
    prefix = ""
    if entry in self._dict:
        if self.is_pron_of(entry, new_pron) is True:
            # this variant is already known: keep the value unchanged
            prefix = self.get_pron(entry)
            new_pron = ""
        else:
            # append the new variant after the known ones
            prefix = self.get_pron(entry) + separators.variants

    # Add (or change) the entry in the dict
    self._dict[entry] = prefix + new_pron
def add_pron(self, token, pron):
    """Add a token/pron to the dict.

    :param token: (str) Unicode string of the token to add
    :param pron: (str) A pronunciation in which the phonemes are
    separated by whitespace

    """
    entry = sppasDictPron.format_token(token)
    pron_str = sppasUnicode(pron).to_strip()
    pron_str = pron_str.replace(" ", sppasDictPron.PHONEMES_SEPARATOR)

    if entry not in self._dict:
        # first pronunciation of this token
        self._dict[entry] = pron_str
        return

    if self.is_pron_of(entry, pron_str) is False:
        # append the new variant after the known ones
        self._dict[entry] = (self.get_pron(entry) +
                             sppasDictPron.VARIANTS_SEPARATOR +
                             pron_str)
    else:
        # the variant is already known: keep the current value
        self._dict[entry] = self.get_pron(entry)
def _map_variant(self, phonvariant):
    """Map phonemes of only one variant of a phonetized entry.

    :param phonvariant: (str) One phonetization variant of an entry.

    """
    phones = self._map_split_variant(phonvariant)

    # For each phoneme, propose the mapped one as an alternative
    substitutions = []
    for phone in phones:
        mapped = self._map_table.map_entry(phone)
        if len(mapped) == 0:
            substitutions.append(phone)
        else:
            substitutions.append(phone + VARIANTS_SEPARATOR + mapped)

    self._dag_phon.variants = 0
    decomposed = self._dag_phon.decompose(" ".join(substitutions))
    phon = sppasUnicode(decomposed).to_strip()

    # Remove un-pronounced phonemes: by convention they are
    # represented by an underscore in the mapping table.
    cleaned = []
    for variant in phon.split(VARIANTS_SEPARATOR):
        kept = [x for x in variant.split(PHONEMES_SEPARATOR)
                if x != "_"]
        cleaned.append(PHONEMES_SEPARATOR.join(kept))

    return VARIANTS_SEPARATOR.join(set(cleaned))
def __init__(self, entry):
    """Create a Token instance.

    :param entry: the content of the token; None is stored as an
    empty string.

    """
    if entry is None:
        self.__entry = ""
    else:
        # NOTE(review): a sppasUnicode *object* is stored here while
        # the None branch stores a plain str -- confirm callers expect
        # this mixed type (a .to_strip()/unicode() call may be missing)
        self.__entry = sppasUnicode(entry)
def get_boundary(self, phonemes):
    """Get the index of the syllable boundary (EXCRULES or GENRULES).

    Phonemes are separated with the symbol defined by
    separators.phonemes variable.

    :param phonemes: (str) Sequence of phonemes to syllabify
    :returns: (int) boundary index or -1 if phonemes don't match any rule.

    """
    phon_list = sppasUnicode(phonemes).to_strip().split(
        separators.phonemes)

    # the sequence of classes of the phonemes
    classes = "".join(self.get_class(p) for p in phon_list)

    # an exception rule has the priority
    if classes in self.exception:
        return self.exception[classes]

    # then a general rule, matched on the number of phonemes
    for key, val in self.general.items():
        if len(key) == len(phon_list):
            return val

    return -1
def load(self, filename):
    """Load the rules from a file.

    :param filename: (str) Name of the file with the rules.

    """
    self.reset()

    # the context manager closes the file: no explicit close() needed
    with open(filename, "r") as f:
        lines = f.readlines()

    for line_nb, line in enumerate(lines, 1):
        line = sppasUnicode(line).to_strip()
        wds = line.split()
        # 3-column rules: PHONCLASS / GENRULE / EXCRULE
        if len(wds) == 3:
            if wds[0] == "PHONCLASS":
                self.phonclass[wds[1]] = wds[2]
            elif wds[0] == "GENRULE":
                self.general[wds[1]] = int(wds[2])
            elif wds[0] == "EXCRULE":
                self.exception[wds[1]] = int(wds[2])
        # 7-column rules: OTHRULE (5 phonemes + a gap value)
        if len(wds) == 7:
            if wds[0] == "OTHRULE":
                s = " ".join(wds[1:6])
                self.gap[s] = int(wds[6])
def _parse_lines(self, lines): """Fill the transcription from the lines of the STM file.""" # the current tier to fill tier = None # Extract rows, create tiers and metadata. for i, line in enumerate(lines): line = sppasUnicode(line).to_strip() # a comment can contain metadata if sppasBaseSclite.is_comment(line): if tier is None: sppasBaseSclite._parse_comment(line, self) else: sppasBaseSclite._parse_comment(line, tier) # ignore comments and blank lines if sppasSTM.check_line(line, i + 1) is False: continue # check for the tier (find it or create it) tier = self.get_tier(line) # extract information of this annotation tab_line = line.split() sppasSTM._create_annotation(tab_line[3], tab_line[4], " ".join(tab_line[5:]), tier)
def _map_variant(self, phonvariant):
    """Map phonemes of only one variant of a phonetized entry.

    :param phonvariant: (str) One phonetization variant of an entry.

    """
    phones = self._map_split_variant(phonvariant)

    # each phoneme gets its mapped value as an alternative, if any
    alternatives = []
    for phone in phones:
        mapped = self._map_table.map_entry(phone)
        if len(mapped) == 0:
            alternatives.append(phone)
        else:
            alternatives.append(
                phone + sppasDictPron.VARIANTS_SEPARATOR + mapped)

    self._dag_phon.variants = 0
    decomposed = self._dag_phon.decompose(" ".join(alternatives))
    phon = sppasUnicode(decomposed).to_strip()

    # Remove un-pronounced phonemes: by convention they are
    # represented by an underscore in the mapping table.
    variants = []
    for v in phon.split(sppasDictPron.VARIANTS_SEPARATOR):
        kept = [p for p in v.split(sppasDictPron.PHONEMES_SEPARATOR)
                if p != "_"]
        variants.append(sppasDictPron.PHONEMES_SEPARATOR.join(kept))

    return sppasDictPron.VARIANTS_SEPARATOR.join(set(variants))
def __stick_longest_lr(self, phrase, separator):
    """Return the longest first word of a phrase.

    A longest matching algorithm is applied from left to right.

    :param phrase: (str)
    :returns: tuple of (index of the first longest token, the longest token)

    """
    parts = phrase.split(" ")
    candidate = parts[0]
    remaining = len(parts)

    # no vocabulary: keep only the very first token
    if self.__vocab is None:
        return 1, candidate

    while remaining > 0:
        # aggregate the remaining tokens into a single candidate
        candidate = separator.join(parts)
        # the next attempt drops the last token
        parts.pop()
        remaining -= 1
        # accept the candidate as soon as it is a known word
        if self.__vocab.is_unk(candidate) is False:
            break

    return remaining, sppasUnicode(candidate).to_strip()
def get_phon_tokens(self, tokens, phonunk=True):
    """Return the phonetization of a list of tokens, with the status.

    Unknown entries are automatically phonetized if `phonunk` is set
    to True.

    :param tokens: (list) The list of tokens to be phonetized.
    :param phonunk: (bool) Phonetize unknown words (or not).
    :returns: A list with the tuple (token, phon, status).

    TODO: EOT is not fully supported.

    """
    tab = list()

    for entry in tokens:
        entry = entry.strip()
        phon = self._pdict.get_unkstamp()
        status = OK_ID

        # Enriched Orthographic Transcription Convention:
        # entry can be already in SAMPA.
        if entry.startswith("/") is True and entry.endswith("/") is True:
            # It must use X-SAMPA, including minus character
            # to separate phonemes.
            phon = entry.strip("/")
        else:
            phon = self.get_phon_entry(entry)

            if phon == self._pdict.get_unkstamp():
                status = ERROR_ID

                # A missing compound word?
                if "-" in entry or "'" in entry or "_" in entry:
                    _tabpron = [self.get_phon_entry(w)
                                for w in re.split("[-'_]", entry)]

                    # OK, finally the entry is in the dictionary?
                    if self._pdict.get_unkstamp() not in _tabpron:
                        # ATTENTION: each part can have variants!
                        # must be decomposed.
                        self._dag_phon.variants = 4
                        phon = sppasUnicode(
                            self._dag_phon.decompose(
                                " ".join(_tabpron))).to_strip()
                        status = WARNING_ID

                if phon == self._pdict.get_unkstamp() and phonunk is True:
                    try:
                        phon = self._phonunk.get_phon(entry)
                        status = WARNING_ID
                    # a bare 'except:' would also swallow SystemExit
                    # and KeyboardInterrupt: catch Exception only
                    except Exception:
                        phon = self._pdict.get_unkstamp()
                        status = ERROR_ID

        if len(phon) > 0:
            tab.append((entry, phon, status))

    return tab
def set_meta(self, key, value):
    """Set or update a metadata.

    key and value are formatted and stored in unicode.

    :param key: (str) The key of the metadata.
    :param value: (str) The value assigned to the key.

    """
    formatted_key = sppasUnicode(key).to_strip()
    formatted_value = sppasUnicode(value).to_strip()
    self.__metadata[formatted_key] = formatted_value
def get_units_julius(lines):
    """Return the units of a palign/walign file (in frames).

    :param lines: (List of str)
    :returns: List of tuples (start, end)
    :raises: IOError if the alignment markers are not found

    """
    units = list()

    # skip everything until the start marker
    i = 0
    while "=== begin forced alignment ===" not in lines[i]:
        i += 1
        # 'i >= len(lines)' (not '>'): prevents an IndexError on the
        # next access to lines[i] when the marker is missing
        if i >= len(lines):
            raise IOError('Time units not found')

    while "=== end forced alignment ===" not in lines[i]:
        i += 1
        if i >= len(lines):
            raise IOError('Time units not found in alignment result')

        if lines[i].startswith('['):
            # New phonemes
            line = lines[i].replace("[", "")
            line = line.replace("]", "")
            line = sppasUnicode(line).to_strip()
            tab = line.split()
            # tab 0: first frame
            # tab 1: last frame
            # tab 2: score of the segmentation (log proba)
            # tab 3: triphone used
            units.append((int(tab[0]), int(tab[1])))

    return units
def set_tiername(self, tier_name):
    """Fix the tiername option.

    :param tier_name: (str)

    """
    su = sppasUnicode(tier_name)
    self._options['tiername'] = su.to_strip()
def read(dir_name):
    """Return a list of (start-time end-time).

    :param dir_name: Name of the directory with the file to read.
    :returns: list of units
    :raises: IOError if the file is missing

    """
    filename = os.path.join(dir_name, ListOfTracks.DEFAULT_FILENAME)
    if os.path.exists(filename) is False:
        raise IOError('The list of tracks is missing of the directory '
                      '{:s}'.format(dir_name))

    # the context manager closes the file: no explicit close() needed
    with open(filename, 'r') as fp:
        lines = fp.readlines()

    # Each line corresponds to a track,
    # with a couple 'start end' of float values.
    _units = list()
    for line in lines:
        line = sppasUnicode(line).to_strip()
        _tab = line.split()
        if len(_tab) >= 2:
            _units.append((float(_tab[0]), float(_tab[1])))

    return _units
def check_data(self):
    """Check the given data to be aligned (phones and tokens).

    :returns: A warning message, or an empty string if check is OK.
    :raises: IOError if no phonetization is available.

    """
    if len(self._phones) == 0:
        raise IOError("No data to time-align.")

    phones = sppasUnicode(self._phones).to_strip().split()
    tokens = sppasUnicode(self._tokens).to_strip().split()
    if len(tokens) != len(phones):
        # report the number of entries, not the raw string lengths
        message = "Tokens alignment disabled: not the same number " \
                  "of tokens in tokenization (%d) and phonetization " \
                  "(%d)." % (len(tokens), len(phones))
        # disable token alignment: one dummy token per phone entry
        self._tokens = " ".join(
            ["w_" + str(i) for i in range(len(phones))])
        return message

    return ""
def set_phones(self, phones):
    """Fix the pronunciations of each token.

    :param phones: (str) Phonetization

    """
    self._phones = sppasUnicode(phones).unicode()
def set_tokens(self, tokens):
    """Fix the tokens.

    :param tokens: (str) Tokenization

    """
    self._tokens = sppasUnicode(tokens).unicode()
def get_count(self, token):
    """Return the count of a token.

    :param token: (str) The string of the token
    :returns: (int) occurrence count, 0 for an unknown token

    """
    key = sppasUnicode(token).to_strip()
    return self.__entries.get(key, 0)
def set_description(self, description=""):
    """Set the description of the controlled vocabulary.

    :param description: (str)

    """
    self.__desc = sppasUnicode(description).to_strip()
def _create_annotation(begin, end, utterance, tier):
    """Add into the tier the annotation corresponding to data of a line."""
    # build the labels from the formatted utterance
    text = sppasUnicode(utterance).to_strip()
    labels = format_labels(text)
    # the localization is the [begin, end] interval
    interval = sppasInterval(sppasBaseSclite.make_point(begin),
                             sppasBaseSclite.make_point(end))
    tier.create_annotation(sppasLocation(interval), labels)
def add(self, entry):
    """Add an entry into the list except if the entry is already inside.

    :param entry: (str) The entry to add in the word list
    :returns: (bool)

    """
    entry = sppasUnicode(entry).to_strip()
    if self.__case_sensitive is False:
        entry = sppasUnicode(entry).to_lower()

    # already in the list: nothing to do
    if entry in self.__entries:
        return False

    self.__entries[entry] = None
    return True
def add(self, entry):
    """Add an entry into the list except if the entry is already inside.

    :param entry: (str) The entry to add in the word list
    :returns: (bool) whether the entry has been added

    """
    word = sppasUnicode(entry).to_strip()
    if self.__case_sensitive is False:
        word = sppasUnicode(word).to_lower()

    if word not in self.__entries:
        # the dict is used as a set: the value is unused
        self.__entries[word] = None
        return True

    return False
def set_tg_prefix_label(self, prefix):
    """Fix the prefix to add to each TG.

    :param prefix: (str) Default is 'tg_'

    """
    tg = sppasUnicode(prefix).to_strip()
    # an empty prefix is ignored: the previous value is kept
    if len(tg) > 0:
        self._options['tg_prefix_label'] = tg
def _create_annotation(begin, duration, word, score):
    """Return the annotation corresponding to data of a line."""
    token = sppasUnicode(word).clear_whitespace()
    label = sppasLabel(sppasTag(token), score)
    # CTM stores a begin time and a duration
    start = float(begin)
    end = start + float(duration)
    location = sppasLocation(
        sppasInterval(sppasBaseSclite.make_point(start),
                      sppasBaseSclite.make_point(end)))
    return sppasAnnotation(location, label)
def add_message(self, message):
    """Add a new message tips in the list of tips.

    :param message: (str) A help message.

    """
    u_message = sppasUnicode(message).to_strip()
    # ignore empty messages
    if len(u_message) > 0:
        self._tips.append(u_message)
def add_message(self, message):
    """Append a help message to the list of tips, if not empty.

    :param message: (str) A help message.

    """
    text = sppasUnicode(message).to_strip()
    if len(text) == 0:
        return
    self._tips.append(text)
def _parse_lines(self, lines): """ Fill the transcription from the lines of the CTM file. """ # the number of the current alternation in_alt = 0 # the annotations of the alternations alternates = dict() # the current tier to fill tier = None # Extract rows, create tiers and metadata. for i, line in enumerate(lines): line = sppasUnicode(line).to_strip() # a comment can contain metadata if sppasBaseSclite.is_comment(line): if tier is None: sppasBaseSclite._parse_comment(line, self) else: sppasBaseSclite._parse_comment(line, tier) # ignore comments and blank lines if sppasCTM.check_line(line, i + 1) is False: continue # check for the tier (find it or create it) tier = self.get_tier(line) # extract information of this annotation tab_line = line.strip().split() wavname, channel, begin, duration, word = tab_line[:5] score = sppasCTM.get_score(line) # check for an alternative annotation if begin == "*": if word == "<ALT_BEGIN>": alternates = dict() in_alt = 1 alternates[in_alt] = list() elif word == "<ALT>": in_alt += 1 alternates[in_alt] = list() else: # todo: we SHOULD add ALL the alternations into the tier # but we add only the first one... sppasCTM._add_alt_annotations(tier, alternates[1]) # re-init alternates = dict() in_alt = 0 else: ann = sppasCTM._create_annotation(begin, duration, word, score) if in_alt == 0: tier.add(ann) else: alternates[in_alt].append(ann)
def is_comment(line):
    """Check if the line is a comment, ie starts with ';;'.

    :param line: (str/unicode)
    :returns: boolean

    """
    return sppasUnicode(line).to_strip().startswith(";;")
def unbind(self, utt):
    """Unbind tokens containing - or ' or . depending on rules.

    :param utt: (list) List of tokens of an utterance
    (a transcription, a sentence, ...)
    :returns: A list of strings

    """
    new_utt = list()
    for tok in utt:
        is_unknown = self.__vocab.is_unk(tok.lower().strip())
        is_sampa = tok.startswith('/') and tok.endswith('/')
        is_trunc = tok.endswith('-')

        # a missing compound word?
        # --> an unknown token
        # --> containing a special character
        # --> that is not a truncated word
        # --> not in a sampa sequence!
        if is_unknown is True \
                and ("-" in tok or "'" in tok or "." in tok) \
                and is_sampa is False \
                and is_trunc is False:
            # KEEP special chars in the array!
            tab_split = re.split("([-'.])", tok)
            tab_tok = list(entry for entry in tab_split
                           if len(entry) > 0)
            idx_start = 0
            while idx_start < len(tab_tok):
                # use a longest matching to aggregate the current
                # token with the next ones (at most 5 pieces at a time)
                idx_end = min(len(tab_tok), idx_start + 5)
                phrase = " ".join(tab_tok[idx_start:idx_end])
                # idx_end is re-assigned: it is the offset of the
                # matched word relative to idx_start
                idx_end, word = self.__stick_longest_lr(
                    sppasUnicode(phrase).to_strip(), "")
                new_utt.append(word)
                idx_start += idx_end + 1
        else:
            new_utt.append(sppasUnicode(tok).to_strip())

    return new_utt
def get_phon_tokens(self, tokens, phonunk=True):
    """Return the phonetization of a list of tokens, with the status.

    Unknown entries are automatically phonetized if `phonunk` is set
    to True.

    :param tokens: (list) The list of tokens to be phonetized.
    :param phonunk: (bool) Phonetize unknown words (or not).
    :returns: A list with the tuple (token, phon, status).

    TODO: EOT is not fully supported.

    """
    tab = list()
    for entry in tokens:
        # default: unknown pronunciation with an OK status
        phon = self._pdict.get_unkstamp()
        status = OK_ID

        # Enriched Orthographic Transcription Convention:
        # entry can be already in SAMPA.
        if entry.startswith("/") is True and entry.endswith("/") is True:
            # Must use SAMPA (including minus to separate phones)
            phon = entry.strip("/")
        else:
            phon = self.get_phon_entry(entry)

            if phon == self._pdict.get_unkstamp():
                status = ERROR_ID

                # A missing compound word?
                if "-" in entry or "'" in entry or "_" in entry:
                    _tabpron = [self.get_phon_entry(w)
                                for w in re.split("[-'_]", entry)]

                    # OK, finally the entry is in the dictionary?
                    if self._pdict.get_unkstamp() not in _tabpron:
                        # ATTENTION: each part can have variants!
                        # must be decomposed.
                        self._dag_phon.variants = 4
                        phon = sppasUnicode(
                            self._dag_phon.decompose(
                                " ".join(_tabpron))).to_strip()
                        status = WARNING_ID

                # last chance: guess a pronunciation
                if phon == self._pdict.get_unkstamp() and phonunk is True:
                    try:
                        phon = self._phonunk.get_phon(entry)
                        status = WARNING_ID
                    except Exception:
                        phon = self._pdict.get_unkstamp()
                        status = ERROR_ID

        tab.append((entry, phon, status))

    return tab
def _readline(self, filename):
    """Read the first line of filename.

    :param filename: (str) Name of the file to read.
    :returns: the line as a unicode formatted string, or an empty
    string if the file can't be read.

    """
    try:
        with codecs.open(filename, 'r', encoding) as fp:
            first = sppasUnicode(fp.readline()).to_strip()
    except Exception:
        # best-effort: any reading error results in an empty line
        return ""

    return first
def check_data(self):
    """Check the given data to be aligned (phones and tokens).

    :returns: A warning message, or an empty string if check is OK.
    :raises: IOError if no phonetization is available.

    """
    if len(self._phones) == 0:
        raise IOError("No data to time-align.")

    phones = sppasUnicode(self._phones).to_strip().split()
    tokens = sppasUnicode(self._tokens).to_strip().split()
    if len(tokens) != len(phones):
        # report the number of entries, not the raw string lengths
        message = "Tokens alignment disabled: " \
                  "not the same number of tokens in tokenization (%d) " \
                  "and phonetization (%d)." \
                  % (len(tokens), len(phones))
        # disable token alignment: one dummy token per phone entry
        self._tokens = " ".join(
            ["w_" + str(i) for i in range(len(phones))])
        return message

    return ""
def unbind(self, utt):
    """Unbind tokens containing - or ' or . depending on rules.

    :param utt: (list) List of tokens of an utterance
    (a transcription, a sentence, ...)
    :returns: A list of strings

    """
    new_utt = list()
    for tok in utt:
        is_unknown = self.__vocab.is_unk(tok.lower().strip())
        is_sampa = tok.startswith('/') and tok.endswith('/')
        is_trunc = tok.endswith('-')

        # a missing compound word?
        # --> an unknown token
        # --> containing a special character
        # --> that is not a truncated word
        # --> not in a sampa sequence!
        if is_unknown is True \
                and ("-" in tok or "'" in tok or "." in tok) \
                and is_sampa is False \
                and is_trunc is False:
            # KEEP special chars in the array!
            tab_split = re.split("([-'.])", tok)
            tab_tok = list(entry for entry in tab_split
                           if len(entry) > 0)
            idx_start = 0
            while idx_start < len(tab_tok):
                # use a longest matching to aggregate the current
                # token with the next ones (pieces taken 5 at a time)
                idx_end = min(len(tab_tok), idx_start + 5)
                phrase = " ".join(tab_tok[idx_start:idx_end])
                # idx_end now holds the matched-word offset returned
                # by the longest-matching helper
                idx_end, word = self.__stick_longest_lr(
                    sppasUnicode(phrase).to_strip(), "")
                new_utt.append(word)
                idx_start += idx_end + 1
        else:
            new_utt.append(sppasUnicode(tok).to_strip())

    return new_utt
def replace(self, utt):
    """Examine tokens and perform some replacements.

    A dictionary with symbols contains the replacements to operate.

    :param utt: (list) the utterance
    :returns: A list of strings

    """
    # Specific case of float numbers: isolate the decimal separator
    sent = ' '.join(utt)
    # raw strings: '\.' and '\,' in plain literals are invalid
    # escapes (DeprecationWarning in Python 3)
    sent = re.sub(r'([0-9])\.([0-9])', r'\1 NUMBER_SEP_POINT \2', sent)
    sent = re.sub(r'([0-9]),([0-9])', r'\1 NUMBER_SEP \2', sent)
    sent = sppasUnicode(sent).to_strip()
    _utt = sent.split()

    # Other generic replacements
    _result = []
    for s in _utt:
        if self.repl.is_key(s):
            # 's.replace(s, x)' always evaluates to 'x': call directly
            s = self.repl.replace(s)
        _result.append(sppasUnicode(s).to_strip())

    return _result
def remove(self, utt, wlist):
    """Remove data of an utterance if included in a dictionary.

    Only used to remove punctuation.

    :param utt: (list) the utterance
    :param wlist: (WordList)
    :returns: A list of strings

    """
    _utt = []
    for tok in utt:
        tok = sppasUnicode(tok).to_strip()
        # keep tokens unknown of the word list, except the special
        # 'gpd_*' / 'ipu_*' markers
        if wlist.is_unk(tok) is True and \
                "gpd_" not in tok and "ipu_" not in tok:
            _utt.append(tok)

    return _utt
def lower(self, utt):
    """Lower a list of strings.

    :param utt: (list)
    :returns: A list of strings

    """
    lowered = []
    for tok in utt:
        if "/" in tok:
            # an already phonetized string is kept as-is
            lowered.append(tok)
        else:
            lowered.append(sppasUnicode(tok).to_lower())

    return lowered
def is_pron_of(self, entry, pron):
    """Return True if pron is a pronunciation of entry.

    Phonemes of pron are separated by "-".

    :param entry: (str) A unicode token to find in the dictionary
    :param pron: (str) A unicode pronunciation
    :returns: bool

    """
    token = sppasDictPron.format_token(entry)
    if token not in self._dict:
        return False

    variant = sppasUnicode(pron).to_strip()
    known = self._dict[token].split(sppasDictPron.VARIANTS_SEPARATOR)
    return variant in known
def split(self, utt):
    """Split an utterance using whitespace.

    If the language is character-based, split each character.

    :param utt: (str) an utterance of a transcription, a sentence, ...
    :returns: A list (array of string)

    """
    s = utt
    if without_whitespace(self.__lang) is True:
        s = self.split_characters(s)

    toks = []
    for t in s.split():
        # if not a phonetized entry
        if t.startswith("/") is False and t.endswith("/") is False:
            if without_whitespace(self.__lang) is False:
                # Split numbers if stick to characters
                # attention: do not replace [a-zA-Z] by [\w]
                # (because \w includes numbers)
                # and not on Asian languages: it can be a tone!
                # NOTE: the former 'ur' string literals are a
                # SyntaxError in Python 3; equivalent escaped u''
                # literals are used instead.
                t = re.sub(u'([0-9])([a-zA-Z])', u'\\1 \\2', t)
                t = re.sub(u'([a-zA-Z])([0-9])', u'\\1 \\2', t)
                # Split some punctuation
                t = re.sub(u'\\[\\]', u'\\\\] \\\\[', t)
                # Split dots if stick to the beginning of a word
                # info: a dot at the end of a word is analyzed
                # by the tokenizer
                t = re.sub(u' \\.([\\w-])', u' . \\1', t)
                t = re.sub(u'^\\.([\\w-])', u' . \\1', t)

            # Split replacement characters
            for r in self.__repl:
                if t.endswith(r):
                    t = t[:-len(r)]
                    t = t + ' ' + r

        toks.append(t.strip())

    s = " ".join(toks)

    # Then split each time there is a space and return result
    s = sppasUnicode(s).to_strip()
    return s.split()
def read(dirname):
    """Read a list file (start-time end-time).

    :param dirname: Name of the directory with the file to read.
    :returns: list of (start, end) couples of float values

    """
    filename = os.path.join(dirname, ListIO.DEFAULT_FILENAME)
    with codecs.open(filename, 'r', encoding) as fp:
        lines = fp.readlines()

    # Each line corresponds to a track,
    # with a couple 'start end' of float values.
    units = []
    for line in lines:
        values = sppasUnicode(line).to_strip().split()
        if len(values) >= 2:
            units.append((float(values[0]), float(values[1])))

    return units
def bind(self, utt):
    """Bind tokens of an utterance using a specific character.

    :param utt: (list) List of tokens of an utterance
    (a transcription, a sentence, ...)
    :returns: A list of strings

    """
    new_utt = list()

    start = 0
    while start < len(utt):
        # aggregate the current token with the next ones, using a
        # longest matching algorithm
        end = min(len(utt), start + self.aggregate_max + 1)
        phrase = sppasUnicode(" ".join(utt[start:end])).to_strip()
        shift, word = self.__stick_longest_lr(phrase, self.separator)
        new_utt.append(word)
        # jump past the tokens consumed by the matched word
        start += shift + 1

    return new_utt
def load_from_ascii(self, filename):
    """Load a pronunciation dictionary from an HTK-ASCII file.

    :param filename: (str) Pronunciation dictionary file name
    :raises: FileIOError if the file can't be read
    :raises: FileFormatError if a line is too short

    """
    try:
        with codecs.open(filename, 'r', encoding) as fd:
            lines = fd.readlines()
    except Exception:
        raise FileIOError(filename)

    for l, line in enumerate(lines):
        uline = sppasUnicode(line).to_strip()

        # Ignore empty lines and check the number of columns
        # NOTE(review): this tests the *string length*, not a column
        # count -- only a 1-character line is rejected; confirm intent
        if len(uline) == 0:
            continue
        if len(uline) == 1:
            raise FileFormatError(l, uline)

        # The entry is before the "[" and the pronunciation is
        # after the "]"; without brackets, fall back on the first
        # whitespace as the entry/pronunciation boundary
        i = uline.find("[")
        if i == -1:
            i = uline.find(" ")
        entry = uline[:i]
        endline = uline[i:]
        j = endline.find("]")
        if j == -1:
            j = endline.find(" ")
        new_pron = endline[j+1:]

        # Phonetic variant of an entry (i.e. entry ends with (XX)):
        # drop the variant number, add_pron() appends variants itself
        i = entry.find("(")
        if i > -1:
            if ")" in entry[i:]:
                entry = entry[:i]

        self.add_pron(entry, new_pron)
def get_phon_entry(self, entry):
    """Return the phonetization of an entry.

    Unknown entries are not automatically phonetized.
    This is a pure dictionary-based method.

    :param entry: (str) The entry to be phonetized.
    :returns: A string with the phonetization of the given entry or
    the unknown symbol.

    """
    entry = sppasUnicode(entry).to_strip()

    # Specific strings... for the italian transcription...
    # For the participation at the CLIPS-Evalita 2011 campaign.
    if entry.startswith(u("<")) is True and \
            entry.endswith(u(">")) is True:
        entry = entry[1:-1]

    # No entry! Nothing to do.
    if len(entry) == 0:
        return ""

    # Specific strings used in the CID transcription...
    # CID is Corpus of Interactional Data, http://sldr.org/sldr000720
    if entry.startswith(u("gpd_")) is True or \
            entry.startswith(u("gpf_")) is True:
        return ""

    # Specific strings used in SPPAS IPU segmentation...
    if entry.startswith(u("ipu_")):
        return ""

    # Find entry in the dict as it is given
    unknown = self._pdict.get_unkstamp()
    pron = self._pdict.get_pron(entry)
    if pron == unknown:
        return unknown

    # OK, the entry is properly phonetized: map the phonemes
    return self._map_phonentry(pron)
def _tier2raw(self, tier, mapp=False):
    """Return all interval contents into a single string.

    :param tier: the tier with the annotations to serialize
    :param mapp: (bool) Map phonemes from SAMPA to the expected ones.

    """
    # Map phonemes from SAMPA to the expected ones.
    self._mapping.set_keep_miss(True)
    self._mapping.set_reverse(True)

    content = ""
    for i, ann in enumerate(tier):
        if ann.GetLabel().IsEmpty() is True:
            # an empty label is serialized as a silence
            logging.info("WARNING: Found an empty annotation label "
                         "at index %d" % i)
            content = content + " sil"
        else:
            # if ann.GetLabel().IsSilence() is False:
            text = ann.GetLabel().GetValue()
            if mapp is True:
                text = self._mapping.map(text)
            # unknown stamps are serialized as silences too
            if unk_stamp in text:
                text = text.replace(unk_stamp, "sil")
            content = content + " " + text

    return sppasUnicode(content).to_strip()
def get_boundary(self, phonemes):
    """Get the index of the syllable boundary (EXCRULES or GENRULES).

    :param phonemes: (str) Phonemes to syllabify, space-separated
    :returns: (int) boundary index or -1 if phonemes does not match
    any rule.

    """
    phon_list = sppasUnicode(phonemes).to_strip().split(" ")

    # the sequence of classes of the phonemes
    classes = ""
    for phon in phon_list:
        classes = classes + self.get_class(phon)

    # an exception rule wins over a general one
    if classes in self.exception:
        return self.exception[classes]

    # a general rule is matched on the number of phonemes
    for key, val in self.general.items():
        if len(key) == len(phon_list):
            return val

    return -1
def load(self, filename):
    """Load the rules using the file "filename".

    :param filename: (str) Name of the file with the
    syllabification rules.
    :raises: IOError if the file does not define enough general rules.

    """
    self.general = dict()    # list of general rules
    self.exception = dict()  # list of exception rules
    self.gap = dict()        # list of gap rules
    self.phonclass = dict()  # list of couples phoneme/classe

    with open(filename, "r") as file_in:
        for line_nb, line in enumerate(file_in, 1):
            wds = sppasUnicode(line).to_strip().split()
            if len(wds) == 3:
                if wds[0] == "PHONCLASS":
                    self.phonclass[wds[1]] = wds[2]
                elif wds[0] == "GENRULE":
                    self.general[wds[1]] = int(wds[2])
                elif wds[0] == "EXCRULE":
                    self.exception[wds[1]] = int(wds[2])
            if len(wds) == 7:
                if wds[0] == "OTHRULE":
                    self.gap[" ".join(wds[1:6])] = int(wds[6])

    # a usable rules file must define at least 4 general rules
    if len(self.general) < 4:
        raise IOError('Syllabification rules file corrupted. '
                      'Got {:d} general rules, {:d} exceptions '
                      'and {:d} other rules.'.format(len(self.general),
                                                     len(self.exception),
                                                     len(self.gap)))

    # ensure the unknown phoneme always has a class
    if "UNK" not in self.phonclass:
        self.phonclass["UNK"] = "#"
def read_palign(self, filename):
    """Read an alignment file in the standard format of Julius CSR engine.

    Parses the ``phseq1:``, ``cmscore1:`` and ``sentence1:`` header lines
    and the per-phoneme ``[first-frame last-frame score triphone]`` lines,
    then converts frame indices (1 frame = 10ms) to seconds and aligns
    word boundaries with phoneme boundaries.

    :param filename: (str) The input file name.
    :returns: Two lists of tuples:
        - (start-time end-time phoneme score)
        - (start-time end-time word score)
    """
    _phonalign = []
    _wordalign = []
    phonidx = -1     # index of the current phoneme in phonlist
    loc_s = 0.       # phoneme start time (seconds)
    loc_e = 0.       # phoneme end time (seconds)
    phonlist = []    # flat list of phonemes, from the "phseq1:" line
    wordseq = []     # per-word phoneme sequences, split on '|'
    scores = [0]     # per-word confidence scores, from "cmscore1:"
    tokens = [""]    # word tokens, from "sentence1:"
    wordlist = []    # index of the LAST phoneme of each word in phonlist

    with codecs.open(filename, 'r', encoding) as fp:
        lines = fp.readlines()

    for line in lines:
        # Each line is either a new annotation or nothing interesting!
        line = sppasUnicode(line).to_strip()

        if line.startswith("=== begin forced alignment ==="):
            # Phoneme lines only appear between the begin/end markers.
            phonidx = 0
        elif line.startswith("=== end forced alignment ==="):
            phonidx = -1
        elif line.startswith("phseq1:"):
            s = sppasUnicode(line[7:])
            line = s.to_strip()
            # '|' separates the phoneme sequence of each word.
            wordseq = line.split('|')
            # get indexes of each word
            wordlist = []
            _idx = -1
            for w in wordseq:
                _wrdphseq = w.strip().split()
                _idx += len(_wrdphseq)
                wordlist.append(_idx)
            # get the list of phonemes (without word segmentation)
            line = line.replace('|', '')
            line = sppasUnicode(line).to_strip()
            phonlist = line.split()
        elif line.startswith('cmscore1:'):
            line = line[9:]
            # confidence score of the pronunciation of each token
            scores = [float(s) for s in line.split()]
            if len(scores) == 0:
                scores = [0]
        elif line.startswith('sentence1:'):
            line = line[10:]
            # each token
            tokens = line.split()
            if len(tokens)==0:
                tokens = [""]
        elif line.startswith('[') and phonidx > -1:
            # New phonemes
            line = line.replace("[", "")
            line = line.replace("]", "")
            line = sppasUnicode(line).to_strip()
            tab = line.split(" ")
            # tab 0: first frame
            # tab 1: last frame
            # tab 2: score of the segmentation (log proba)
            # tab 3: triphone used
            # Frames are hundredths of a second.
            loc_s = (float(tab[0]) / 100.)
            loc_e = (float(tab[1]) / 100.)
            if len(tab)>3:
                # Put real phoneme instead of triphones
                _phonalign.append([loc_s, loc_e, phonlist[phonidx], tab[2]])
            else:
                _phonalign.append([loc_s, loc_e, "", tab[2]])
            phonidx = phonidx+1

    # Adjust time values and create wordalign
    wordidx = 0     # word index
    wordloc_s = 0.  # word start time
    loc_e = 0.
    nextloc_s = 0.
    for phonidx in range(len(_phonalign)):

        # Fix the end of this annotation to the begin of the next one.
        loc_e = _phonalign[phonidx][1]
        if phonidx < (len(_phonalign)-1):
            # some hack because julius has a tendency to always be... ahead!
            nextloc_s = _phonalign[phonidx+1][0] + 0.01
            _phonalign[phonidx+1][0] = nextloc_s
        else:
            nextloc_s = 0.
        if loc_e < nextloc_s:
            loc_e = nextloc_s
        _phonalign[phonidx][1] = loc_e

        # Override the segmentation score of the phone by
        # the score of the pronunciation of the word
        # NOTE(review): assumes scores/wordlist stay in sync with the
        # phoneme stream; a malformed file would raise IndexError here.
        _phonalign[phonidx][3] = scores[wordidx]

        # add also the word?
        # wordlist[wordidx] is the index of the word's last phoneme.
        if phonidx == wordlist[wordidx]:
            _wordalign.append([wordloc_s, loc_e, tokens[wordidx], scores[wordidx]])
            wordidx = wordidx + 1
            wordloc_s = loc_e

    # last word, or the only entry in case of empty interval...
    if len(wordseq)-1 == wordidx:
        _wordalign.append([wordloc_s, loc_e, tokens[wordidx-1], scores[wordidx-1]])

    return _phonalign,_wordalign
def run_alignment(self, inputwav, outputalign, N=3):
    """Execute the external program `julius` to align.

    The data related to the unit to time-align need to be previously
    fixed with:

        - set_phones(str)
        - set_tokens(str)

    Julius is run once; if its output reports dictionary entries that are
    missing from the acoustic model's tiedlist, those entries are added
    and Julius is run a second time.

    :param inputwav: (str - IN) the audio input file name, of type PCM-WAV 16000 Hz, 16 bits
    :param outputalign: (str - OUT) the output file name
    :param N: (int) N value of N-grams, used only if SLM (i.e. outext=walign)
    :returns: (str) A message of `julius`.
    :raises: Exception if Julius reported errors or its search failed.
    """
    outputalign = outputalign + "." + self._outext

    basename = os.path.splitext(inputwav)[0]
    # "palign" output implies grammar-based alignment; anything else
    # uses the statistical language model (SLM) dependencies.
    if self._outext == "palign":
        self.gen_grammar_dependencies(basename)
    else:
        self.gen_slm_dependencies(basename)

    # First pass.
    self.run_julius(inputwav, basename, outputalign)
    with codecs.open(outputalign, 'r', encoding) as f:
        lines = f.readlines()

    errorlines = ""
    message = ""
    entries = []

    # Collect, from the LAST matching error line, the dictionary entries
    # that Julius could not find in the tiedlist (quoted in the message).
    for line in lines:
        if line.find("Error: voca_load_htkdict") > -1 and line.find("not found") > -1:
            line = sppasUnicode(line).to_strip()
            line = line[line.find('"')+1:]
            line = line[:line.find('"')]
            if len(line) > 0:
                entries = line.split()

    # If entries could be added to the tiedlist, re-run Julius and
    # re-read its (overwritten) output.
    if len(entries) > 0:
        added = self.add_tiedlist(entries)
        if len(added) > 0:
            message = "The acoustic model was modified. The following entries were successfully added into the tiedlist: "
            message = message + " ".join(added) + "\n"
            self.run_julius(inputwav, basename, outputalign)
            with codecs.open(outputalign, 'r', encoding) as f:
                lines = f.readlines()

    # Accumulate remaining error lines ("line" errors are ignored:
    # they refer to input-line diagnostics, not fatal failures).
    for line in lines:
        if (line.startswith("Error:") or line.startswith("ERROR:")) and " line " not in line:
            errorlines = errorlines + line
        if "search failed" in line:
            message = "Julius search has failed to find the transcription in the audio file. "
            errorlines = "Search error. " + errorlines

    if len(errorlines) > 0:
        raise Exception(message + errorlines)

    return message
def toe_spelling(self, entry, std=False):
    """Create a specific spelling from an Enriched Orthographic Transcription.

    :param entry: (str) the EOT string
    :param std: (bool) Standard spelling expected instead of the Faked one.
    :returns: (str)

    DevNote: Python's regular expression engine supports Unicode.
    It can apply the same pattern to either 8-bit (encoded) or
    Unicode strings. To create a regular expression pattern that
    uses Unicode character classes for \\w (and \\s, and \\b), use
    the "(?u)" flag prefix, or the re.UNICODE flag.

    FIX(review): re.UNICODE was previously passed as the 4th POSITIONAL
    argument of re.sub(), which is `count` (so the flag was never applied
    and replacements were silently capped at 32). It is now passed with
    the `flags=` keyword, as the DevNote above intended.
    """
    # Ensure all regexp will work!
    _fentry = " " + u(entry) + " "

    if std is False:
        # Stick unregular Liaisons to the previous token
        _fentry = re.sub(u' =([\w]+)=', u(r'-\1'), _fentry, flags=re.UNICODE)
    else:
        # Remove Liaisons
        _fentry = re.sub(u' =([\w]+)=', u(r' '), _fentry, flags=re.UNICODE)

    # Laughing sequences
    _fentry = re.sub(u"\s?@\s?@\s?", u" ", _fentry, flags=re.UNICODE)

    # Laughing
    _fentry = re.sub(u"([\w\xaa-\xff]+)@", u(r"\1 @"), _fentry, flags=re.UNICODE)
    _fentry = re.sub(u"@([\w\xaa-\xff]+)", u(r"@ \1"), _fentry, flags=re.UNICODE)

    # Noises
    _fentry = re.sub(u"([\w\xaa-\xff]+)\*", u(r"\1 *"), _fentry, flags=re.UNICODE)
    _fentry = re.sub(u"\*([\w\xaa-\xff]+)", u(r"* \1"), _fentry, flags=re.UNICODE)

    # Transcriptor comment's: {comment}
    _fentry = re.sub(u'\\{[\s\w\xaa-\xff\-:]+\\}', u(r''), _fentry, flags=re.UNICODE)
    # Transcriptor comment's: [comment]
    _fentry = re.sub(u'\\[[\s\w\xaa-\xff\-:]+\\]', u(r''), _fentry, flags=re.UNICODE)

    if std is False:
        # Special elisions (remove parenthesis content)
        _fentry = re.sub(u'\\([\s\w\xaa-\xff\-\']+\\)', u(r''), _fentry, flags=re.UNICODE)
    else:
        # Special elisions (keep parenthesis content)
        _fentry = re.sub(u'\\(([\s\w\xaa-\xff\-]+)\\)', u(r'\1'), _fentry, flags=re.UNICODE)

    # Morphological variants are ignored for phonetization (same pronunciation!)
    _fentry = re.sub(u'\s+\\<([\-\'\s\w\xaa-\xff]+),[\-\'\s\w\xaa-\xff]+\\>', u(r' \1'), _fentry, flags=re.UNICODE)
    _fentry = re.sub(u'\s+\\{([\-\'\s\w\xaa-\xff]+),[\-\'\s\w\xaa-\xff]+\\}', u(r' \1'), _fentry, flags=re.UNICODE)

    if std is False:
        # Special pronunciations (keep right part)
        _fentry = re.sub(u'\s+\\[([\s\w\xaa-\xff/-]+),([\s\w\xaa-\xff/]+)\\]', u(r' \2'), _fentry, flags=re.UNICODE)
    else:
        # Special pronunciations (keep left part)
        _fentry = re.sub(u'\s+\\[([\s\w\xaa-\xff\\/-]+),[\s\w\xaa-\xff\\/]+\\]', u(r' \1'), _fentry, flags=re.UNICODE)

    # Proper names: $ name ,P\$
    _fentry = re.sub(u',\s?[PTS]+\s?[\\/\\\]+\s?\\$', u(r''), _fentry, flags=re.UNICODE)
    _fentry = re.sub(u'\\$', u(r''), _fentry, flags=re.UNICODE)

    # specific case with numbers
    _fentry = re.sub(u"\s(?=,[0-9]+)", "", _fentry, flags=re.UNICODE)

    # ok, now stop regexp and work with unicode:
    _fentry = sppasUnicode(_fentry).to_strip()

    # Punctuations at the end of a token
    s = []
    entries = _fentry.split()
    for c in entries:
        # Check for the SAMPA sequence to assign properly "in_sampa"
        if c.startswith("/") and c.endswith('/'):
            in_sampa = True
        else:
            in_sampa = False

        # if not in_sampa, add a whitespace if some punctuations are stick to a word
        if in_sampa is False:
            # if there is a serie of punctuations at the beginning
            while len(c) > 0 and category(c[0])[0] in ('P', 'S'):
                s.append(c[0])
                c = c[1:]
            # if there is a serie of punctuations at the end
            end_punct = []
            while len(c) > 0 and category(c[-1])[0] in ('P', 'S'):
                end_punct.append(c[-1])
                c = c[:-1]
            # a single trailing dot stays stuck to the word
            if len(end_punct) == 1 and end_punct[0] == u("."):
                s.append(c+u("."))
            else:
                s.append(c)
                if len(end_punct) > 0:
                    s.extend(reversed(end_punct))
        else:
            # a SAMPA sequence is glued back to the previous token
            if len(s) == 0:
                s.append(c)
            else:
                s[-1] += c

    return " ".join(s)
def read_walign(self, filename):
    """Read an alignment file in the standard format of Julius CSR engine.

    :param filename: (str) The input file name.
    :returns: Two lists of tuples:
        - None
        - (start-time end-time word score)
    """
    word_labels = [""]
    word_scores = [0]
    aligned = []
    idx = -1    # word index; -1 while outside a forced-alignment section

    with codecs.open(filename, 'r', encoding) as fp:
        content = fp.readlines()

    for raw_line in content:
        # Each line is either a new annotation or nothing interesting!
        raw_line = sppasUnicode(raw_line).to_strip()

        if raw_line.startswith("=== begin forced alignment ==="):
            idx = 0
        elif raw_line.startswith("=== end forced alignment ==="):
            idx = -1
        elif raw_line.startswith('wseq1:'):
            # each token
            word_labels = raw_line[6:].split()
            if len(word_labels) == 0:
                word_labels = [""]
        elif raw_line.startswith('cmscore1:'):
            # confidence score of the pronunciation of each token
            word_scores = [float(v) for v in raw_line[9:].split()]
            if len(word_scores) == 0:
                word_scores = [0]
        elif raw_line.startswith('[') and idx > -1:
            # A new time-aligned word. Fields are:
            # first frame, last frame, segmentation score (log proba), word
            cleaned = raw_line.replace("[", "").replace("]", "")
            fields = sppasUnicode(cleaned).to_strip().split(" ")
            begin = float(fields[0]) / 100.
            end = float(fields[1]) / 100.
            aligned.append([begin, end, word_labels[idx], word_scores[idx]])
            idx = idx + 1

    # Adjust time values:
    # stretch each word's end up to the begin of the next one if needed.
    for pos in range(len(aligned)):
        if pos < len(aligned) - 1:
            follower_begin = aligned[pos + 1][0]
        else:
            follower_begin = 0.
        if aligned[pos][1] < follower_begin:
            aligned[pos][1] = follower_begin

    return None, aligned
def normalize(self, entry, actions=[]):
    """Tokenize an utterance.

    :param entry: (str) the string to normalize
    :param actions: (list) the modules/options to enable.

        - "std": generated the standard orthography instead of the faked one
        - "replace": use a replacement dictionary
        - "tokenize": tokenize the entry
        - "numbers": convert numbers to their written form
        - "lower": change case of characters to lower
        - "punct": remove punctuation

    :returns: (str) the normalized entry

    Important: An empty actions list or a list containing only "std"
    means to enable all actions.
    """
    # FIX(review): work on a private copy. The previous code appended
    # the default actions directly into the caller's list — and into
    # the shared mutable default argument — leaking state between calls.
    actions = list(actions)

    _str = sppasUnicode(entry).to_strip()

    # Remove UTF-8 specific characters that are not in our dictionaries!
    for key in self.dicoutf:
        _str = _str.replace(key, self.dicoutf.replace(key))

    # Clean the Enriched Orthographic Transcription
    ortho = sppasTranscription()
    _str = ortho.clean_toe(_str)
    if "std" in actions:
        _str = ortho.toe_spelling(_str, True)
    else:
        _str = ortho.toe_spelling(_str, False)

    # Split using whitespace or characters.
    splitter = sppasTokSplitter(self.lang, self.repl)
    utt = splitter.split(_str)

    # The entry is now a list of strings on which we'll perform actions
    # -----------------------------------------------------------------
    if len(actions) == 0 or (len(actions) == 1 and "std" in actions):
        actions.extend(["replace", "tokenize", "numbers", "lower", "punct"])

    if "replace" in actions:
        utt = self.replace(utt)

    if "tokenize" in actions:
        utt = self.tokenize(utt)

    if "numbers" in actions:
        utt = self.numbers(utt)

    if "lower" in actions:
        utt = self.lower(utt)

    if "punct" in actions:
        utt = self.remove(utt, self.punct)

    # Finally, prepare the result:
    # whitespace inside a token is protected with an underscore.
    result = ""
    for s in utt:
        s = sppasUnicode(s).to_strip()
        result = result + " " + s.replace(" ", "_")

    result = sppasUnicode(result).to_strip()
    if len(result) == 0:
        return ""  # Nothing valid!

    return result.replace(" ", self.delimiter)