def epi_test():
    """Print two transliterations of the word 'Werkt' as a quick manual check.

    The first uses an ``Epitran`` instance for Dutch only; the second uses a
    ``Backoff`` instance configured with English first and Dutch second.
    """
    sample = 'Werkt'

    # Dutch phonemes only.
    dutch_only = Epitran('nld-Latn')
    print(dutch_only.transliterate(sample))

    # English phonemes first, with Dutch also considered.
    # Kept here to see whether flite works.
    english_then_dutch = Backoff(['eng-Latn', 'nld-Latn'])
    print(english_then_dutch.transliterate(sample))
def compare_epi_to_dict():
    """Print how many dictionary entries disagree with Epitran's Dutch output.

    Loads the grapheme-to-phoneme mapping from
    ``g2p_dictionary/dutch_dic_to_phonetic.1.json``, transliterates every key
    with an ``Epitran('nld-Latn')`` instance, and prints the number of keys
    whose dictionary value differs from the Epitran result.
    """
    # Use a context manager so the file handle is closed as soon as the JSON
    # has been parsed (the handle was previously left open).
    with open('g2p_dictionary/dutch_dic_to_phonetic.1.json', 'r') as dict_file:
        g2p_dict = json.load(dict_file)

    epi = Epitran('nld-Latn')
    n = 0
    for key, dic_word in g2p_dict.items():
        epi_word = epi.transliterate(key)
        if dic_word != epi_word:
            # To inspect individual mismatches, print key, dic_word and
            # epi_word here.
            n += 1
    print(n)
def __init__(self, code, space_names):
    """Construct a VectorsWithIPASpace object.

    Through its word_to_segs method, the object turns orthographic words
    into a list of tuples made of category (letter or punctuation),
    lettercase, orthographic form, phonetic form, id within an IPA space,
    and articulatory feature vector.

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        space_names (list): list of space names consisting of ISO 639-3
                            codes joined to ISO 15924 codes with "-"
    """
    # Transliterator for the requested language/script pair.
    self.epi = Epitran(code)
    # Space built from the same code plus the requested space names.
    ipa_space = Space(code, space_names)
    self.space = ipa_space
def get_tgt_code_and_transcriber(
        target: str,
        pron_dict: Optional[dict] = None,
        need_transcriber: bool = True) -> Tuple[str, G2P_func]:
    """Resolve the target language code and pick a transcription callable.

    Args:
        target: Target language identifier. The special value ``'roa-opt'``
            maps to the code ``'roa_opt'``; anything else is resolved with
            ``lookup(target).alpha_3``.
        pron_dict: Optional pronunciation dictionary keyed by
            ``(tgt_code, token)``. When given, it is used instead of Epitran.
        need_transcriber: When false, no transcriber is built.

    Returns:
        A ``(tgt_code, tgt_g2p)`` pair. ``tgt_g2p`` is ``None`` when
        ``need_transcriber`` is false, a dictionary lookup returning ``None``
        for missing entries when ``pron_dict`` is given, and otherwise the
        ``transliterate`` method of an ``Epitran`` instance.

    Raises:
        ValueError: If Epitran would be needed but the resolved code is not
            one of the supported Latin-script languages.
    """
    tgt_code = 'roa_opt' if target == 'roa-opt' else lookup(target).alpha_3

    if not need_transcriber:
        return tgt_code, None

    # Use pronunciation dictionary.
    if pron_dict is not None:
        # Return None if entry not found.
        return tgt_code, (lambda token: pron_dict.get((tgt_code, token), None))

    # Use epitran.
    epitran_languages = (
        'ita', 'spa', 'por', 'fra', 'cat', 'ron', 'deu', 'nld', 'swe'
    )
    if tgt_code not in epitran_languages:
        raise ValueError(f'language {target} not supported.')
    return tgt_code, Epitran(f'{tgt_code}-Latn').transliterate
def get_data_stream(self, min_count=1, min_length=1):
    """Yield the phonetic word list of every line that passes validation.

    Opens the text file ``self._file`` and the JSON dictionary
    ``self._dict_file``, lower-cases the dictionary keys, and creates an
    ``Epitran`` instance from ``self._epi_code``. Each text line is passed to
    ``self.transliterate_line``; the result is yielded only when
    ``self.validate_line`` accepts it.

    Args:
        min_count: Forwarded to ``self.validate_line``.
        min_length: Forwarded to ``self.validate_line``.

    Yields:
        The value returned by ``self.transliterate_line`` for each accepted
        line.
    """
    with open(self._file, 'r') as corpus, \
            open(self._dict_file, 'r') as dict_handle:
        # Dictionary lookups are done on lower-cased keys.
        lowered_dict = {}
        for entry, phonetic in json.load(dict_handle).items():
            lowered_dict[entry.lower()] = phonetic

        transliterator = Epitran(self._epi_code)
        for raw_line in corpus:
            phonetic_words = self.transliterate_line(
                raw_line, lowered_dict, transliterator)
            if not self.validate_line(phonetic_words, min_count, min_length):
                continue
            yield phonetic_words
def set_corpus(self):
    """Count every phonetic word of the corpus in ``self._count_table``.

    Opens the text file ``self._file`` and the JSON dictionary
    ``self._dict_file``, lower-cases the dictionary keys, and creates an
    ``Epitran`` instance from ``self._epi_code``. Each line is converted with
    ``self.transliterate_line`` and the count of every resulting word is
    incremented by one.
    """
    with open(self._file, 'r') as corpus, \
            open(self._dict_file, 'r') as dict_handle:
        # Dictionary lookups are done on lower-cased keys.
        lowered_dict = {}
        for entry, phonetic in json.load(dict_handle).items():
            lowered_dict[entry.lower()] = phonetic

        transliterator = Epitran(self._epi_code)
        for raw_line in corpus:
            phonetic_words = self.transliterate_line(
                raw_line, lowered_dict, transliterator)
            for token in phonetic_words:
                self._count_table[token] += 1
def main():
    """Read words from stdin and print their Epitran IPA segments.

    Command-line arguments:
        language: epitran language code passed to ``Epitran``.
        --print-word: print each word, followed by a space, before its IPA.
        --sep: string placed between IPA symbols; nothing when omitted or
            empty.

    Blank input lines are ignored. A notice that input is being read is
    written to stderr before the first word is processed.
    """
    parser = argparse.ArgumentParser(prog="words2ipa.py")
    parser.add_argument(
        "language", help="epitran language code (e.g., eng-Latn)")
    parser.add_argument(
        "--print-word", action="store_true", help="Print word before IPA"
    )
    parser.add_argument(
        "--sep", help="Separator between IPA symbols (default: none)")
    options = parser.parse_args()

    transliterator = Epitran(options.language)
    # A missing or empty --sep means the symbols are concatenated directly.
    joiner = options.sep if options.sep else ""

    print("Reading words from stdin...", file=sys.stderr)
    for raw_line in sys.stdin:
        word = raw_line.strip()
        if not word:
            continue
        if options.print_word:
            print(word, end=" ")
        print(joiner.join(transliterator.trans_list(word)))
class VectorsWithIPASpace(object):
    def __init__(self, code, space_names):
        """Construct a VectorsWithIPASpace object.

        Through its word_to_segs method, the object turns orthographic words
        into a list of tuples made of category (letter or punctuation),
        lettercase, orthographic form, phonetic form, id within an IPA
        space, and articulatory feature vector.

        Args:
            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
            space_names (list): list of space names consisting of ISO 639-3
                                codes joined to ISO 15924 codes with "-"
        """
        self.epi = Epitran(code)
        ipa_space = Space(code, space_names)
        self.space = ipa_space

    def word_to_segs(self, word, normpunc=False):
        """Return feature vectors, etc. for segments and punctuation in a word.

        Args:
            word (unicode): Unicode string representing a word in the
                            orthography specified when the class is
                            instantiated
            normpunc (bool): normalize punctuation

        Returns:
            list: a list of tuples, each representing an IPA segment or a
                  punctuation character. Tuples consist of <category,
                  lettercase, orthographic_form, phonetic_form, id,
                  feature_vector>.

                  Category consists of the standard Unicode classes (e.g. 'L'
                  for letter and 'P' for punctuation). Case is binary: 1 for
                  uppercase and 0 for lowercase.
        """
        result = []
        for category, lettercase, orth, phon, id_vec_pairs in \
                self.epi.word_to_tuples(word, normpunc):
            # Only entries without a phonetic form are punctuation-normalized.
            if not phon and normpunc and orth in self.epi.puncnorm:
                orth = self.epi.puncnorm[orth]
            for segment, vector in id_vec_pairs:
                # Prefer the segment's id, fall back to the orthographic
                # form's id, and use -1 when neither is in the space.
                seg_id = -1
                for candidate in (segment, orth):
                    if candidate in self.space:
                        seg_id = int(self.space[candidate])
                        break
                result.append(
                    (category, lettercase, orth, phon, seg_id, vector))
        return result
def __init__(self, code, space_names):
    """Construct a Space object.

    A Space object maps strings (corresponding to segments) to integers,
    placing them in an integer space that can be translated into a one-hot
    vector. It offers a dictionary-like interface that supports indexing
    and iteration over "keys".

    Args:
        code (str): ISO 639-3 code joined to ISO 15924 code with "-"
        space_names (list): list of space names consisting of ISO 639-3
                            codes joined to ISO 15924 codes with "-"
    """
    # Assigned before loading the space, in case _load_space reads it.
    self.epi = Epitran(code)
    loaded_space = self._load_space(space_names)
    self.dict = loaded_space
def ger_to_ipa(text: str) -> str:
    """Transliterate ``text`` with a cached ``Epitran('deu-Latn')`` instance.

    The instance is created on the first call and stored in the
    module-level ``_epitran_cache`` under ``Language.GER``, so later calls
    reuse it.

    Args:
        text: The text to transliterate.

    Returns:
        The result of ``Epitran.transliterate`` for ``text``.
    """
    # Test membership on the mapping itself; going through ``.keys()`` adds
    # nothing. NOTE(review): assumes ``_epitran_cache`` is a regular dict.
    if Language.GER not in _epitran_cache:
        _epitran_cache[Language.GER] = Epitran('deu-Latn')
    return _epitran_cache[Language.GER].transliterate(text)
def en_to_ipa(text: str) -> str:
    """Transliterate ``text`` with a cached ``Epitran('eng-Latn')`` instance.

    The instance is created on the first call and stored in the
    module-level ``_epitran_cache`` under ``Language.ENG``, so later calls
    reuse it.

    Args:
        text: The text to transliterate.

    Returns:
        The result of ``Epitran.transliterate`` for ``text``.
    """
    # Test membership on the mapping itself; going through ``.keys()`` adds
    # nothing. NOTE(review): assumes ``_epitran_cache`` is a regular dict.
    if Language.ENG not in _epitran_cache:
        _epitran_cache[Language.ENG] = Epitran('eng-Latn')
    return _epitran_cache[Language.ENG].transliterate(text)
from epitran import Epitran
from util import lang2ISO
import codecs
import argparse

if __name__ == '__main__':
    # Report (lemma, word form) pairs from a tab-separated file that are
    # purely digits or that Epitran transliterates to an empty string.
    parser = argparse.ArgumentParser("Clean Data")
    parser.add_argument('fn', metavar='fn')
    parser.add_argument('lang', metavar='lang')
    args = parser.parse_args()
    fn = args.fn
    lang = args.lang

    with codecs.open(fn, "r", encoding='utf-8') as handle:
        # Skip blank lines: they split into a single empty field and would
        # break the three-way unpacking in the loop below.
        lines = [line.strip().split('\t') for line in handle if line.strip()]

    iso = lang2ISO(lang)
    epi = Epitran(iso)
    # Each row is expected to hold exactly three fields; the third is unused.
    for lemma, wf, _tags in lines:
        if lemma.isdigit() or wf.isdigit():
            print("Digit! %s, %s" % (lemma, wf))
        elif not (epi.transliterate(lemma) and epi.transliterate(wf)):
            print("Cannot transliterate! %s, %s" % (lemma, wf))
to_rectify = [('g', 'ɡ'), ('gʷ', 'ɡʷ'), ('h', 'x'), ('hʷ', 'xʷ'), ('ɛ', 'e'), ('ɣ', 'ɡ'), ('ɔ', 'o')] non_transcriber = OldNorseTranscription() desc[ipa_col] = desc[form_col].apply( lambda s: non_transcriber.transcribe(s).strip('[]')).apply( i2t).apply(lambda lst: [replace(x, to_rectify) for x in lst]) elif lang in ['it', 'es', 'fr', 'uk', 'pl', 'ru']: lang2code = { 'it': 'ita-Latn', 'es': 'spa-Latn', 'fr': 'fra-Latn', 'ru': 'rus-Cyrl', 'uk': 'ukr-Cyrl', 'pl': 'pol-Latn' } transcriber = Epitran(lang2code[lang]) ipa_col = f'{lang}_ipa' form_col = 'desc_form' # Italian doesn't have phonemic diphthongs. merge_vowels = lang != 'it' desc[ipa_col] = desc[form_col].apply( lambda s: i2t(transcriber.transliterate(s).replace('ˈ', ''). replace('ˌ', '').replace("'", ''), merge_vowels=merge_vowels)) to_normalize = list() if lang == 'ru': to_normalize = [('á', 'a'), ('ó', 'o'), ('é', 'e'), ('ú', 'u'), ('ɨ́', 'ɨ'), ('í', 'i'), ('t͡ɕʲ', 't͡ɕ'), ('ʂʲ', 'ʂ')] elif lang == 'uk': to_normalize = [('ɑ́', 'ɑ'), ('ɔ́', 'ɔ'), ('ɛ́', 'ɛ'), ('í', 'i'),