def __init__(self, lang='es', text_analysis=None):
    try:
        if text_analysis is None:
            self.ta = TextAnalysis(lang=lang)
        else:
            self.ta = text_analysis
        file_lexicon = DIR_INPUT + 'NRC-VAD-Lexicon.txt'
        file_word_embedding_en = DIR_MODELS + 'word_embedding_en.model'
        file_word_embedding_es = DIR_MODELS + 'word_embedding_es.model'
        file_syllable_embedding_en = DIR_MODELS + 'syllable_embedding_en.model'
        file_syllable_embedding_es = DIR_MODELS + 'syllable_embedding_es.model'
        file_phoneme_embedding_en = DIR_MODELS + 'phoneme_embedding_en.model'
        file_phoneme_embedding_es = DIR_MODELS + 'phoneme_embedding_es.model'
        print('Loading lexicons and embeddings...')
        if lang == 'es':
            epi = epitran.Epitran('spa-Latn')
            word_embedding = Word2Vec.load(file_word_embedding_es)
            syllable_embedding = Word2Vec.load(file_syllable_embedding_es)
            phoneme_embedding = Word2Vec.load(file_phoneme_embedding_es)
        else:
            epi = epitran.Epitran('eng-Latn')
            word_embedding = Word2Vec.load(file_word_embedding_en)
            syllable_embedding = Word2Vec.load(file_syllable_embedding_en)
            phoneme_embedding = Word2Vec.load(file_phoneme_embedding_en)
        # The lexicon load was identical in both branches, so it is hoisted out.
        lexicon = self.ta.import_lexicon_vad(file_lexicon, lang=lang)
        self.epi = epi
        self.lexicon = lexicon
        self.word_embedding = word_embedding
        self.syllable_embedding = syllable_embedding
        self.phoneme_embedding = phoneme_embedding
    except Exception as e:
        Utils.standard_error(sys.exc_info())
        print('Error FeatureExtraction: {0}'.format(e))
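A minimal usage sketch for the initializer above. The class name FeatureExtraction is inferred from the error message, and the lexicon and embedding files are assumed to exist under DIR_INPUT and DIR_MODELS; both are assumptions, not part of the original snippet.

# Hypothetical usage; assumes the enclosing class is FeatureExtraction
# and that the files referenced in __init__ are in place.
fe = FeatureExtraction(lang='es')           # builds its own TextAnalysis
print(fe.epi.transliterate('fonema'))       # IPA string via epitran

# An existing TextAnalysis can be injected to avoid loading it twice:
ta = TextAnalysis(lang='en')
fe_en = FeatureExtraction(lang='en', text_analysis=ta)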
class FicheEng(CardLanguage):
    # Phonetic transliterator (English)
    eng = epitran.Epitran('eng-Latn')

    # Apply phonetic transliteration
    def translate(self, word):
        # `word` is already a str in Python 3; the old
        # unicode(word, 'utf-8') call was a Python 2 idiom, and the class
        # attribute must be reached through self.
        self.phonetic = self.eng.transliterate(word)
class FicheSp(CardLanguage):
    # Phonetic transliterator (Spanish)
    sp = epitran.Epitran('spa-Latn')

    # Apply phonetic transliteration
    def translate(self, word):
        self.phonetic = self.sp.transliterate(word)
def __init__(self):
    import epitran
    print(".. load rus-Cyrl")
    self.epi = epitran.Epitran('rus-Cyrl')
    self.target_file = f'{cf.conf_dir}/langs/voc/ru-map.json'
    self.target_file_rev = f'{cf.conf_dir}/langs/voc/ru-rev-map.json'
def __init__(self, lang):
    lang_ipa = {'es': 'spa-Latn', 'en': 'eng-Latn'}
    lang_stemm = {'es': 'spanish', 'en': 'english'}
    self.lang = lang
    self.stemmer = SnowballStemmer(language=lang_stemm[lang])
    self.epi = epitran.Epitran(lang_ipa[lang])
    self.nlp = self.load_sapcy(lang)
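A quick sketch of how the attributes initialized above might be exercised; the wrapper class name TextProcessor is hypothetical.

# Hypothetical usage; TextProcessor is an assumed name for the class above.
proc = TextProcessor('es')
print(proc.stemmer.stem('corriendo'))       # Snowball stem of the token
print(proc.epi.transliterate('corriendo'))  # IPA transliteration of the token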
def gettrans():
    lang = request.args.get('lang', 'eng-Latn', type=str)
    textin = request.args.get('textin', '', type=str)
    epi = epitran.Epitran(lang, cedict_file='data/cedict_1_0_ts_utf-8_mdbg.txt')
    trans = ' '.join([epi.transliterate(w) for w in textin.split(' ')])
    return trans
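The route above can be exercised with Flask's test client; a sketch assuming the application object is named app and the view is registered under /gettrans (both assumptions).

# Hypothetical test; assumes `app` is the Flask app and the route path
# is /gettrans.
with app.test_client() as client:
    resp = client.get('/gettrans', query_string={
        'lang': 'spa-Latn',
        'textin': 'hola mundo',
    })
    print(resp.get_data(as_text=True))  # space-joined IPA, word by word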
class FicheFr(CardLanguage):
    # Phonetic transliterator (French)
    fr = epitran.Epitran('fra-Latn')

    # Apply phonetic transliteration
    def translate(self, word):
        self.phonetic = self.fr.transliterate(word)
def main(code):
    epi = epitran.Epitran(code)
    for line in sys.stdin:
        # sys.stdin already yields str in Python 3, so no decode/encode.
        line = unicodedata.normalize('NFD', line.lower())
        sys.stdout.write(epi.transliterate(line))
def main(fn):
    epi = epitran.Epitran('uig-Arab')
    vwis = epitran.vector.VectorsWithIPASpace('uig-Arab', ['uig-Arab'])
    tree = etree.parse(fn)
    root = tree.getroot()
    for token in root.findall('.//TOKEN'):
        # token.text is already str in Python 3; no unicode()/encode() needed.
        print(epi.transliterate(token.text))
def main(mode, fn):
    epi = epitran.Epitran(mode)
    with open(fn, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            a, b, gloss = line.split('\t')
            ipa = epi.transliterate(a)
            print('\t'.join([ipa, a, b, gloss]))
def convert_to_ipa(texts):
    epi = epitran.Epitran('eng-Latn')
    for text_mel_pair in texts:
        text_mel_pair[1] = ipa.convert(text_mel_pair[1])
        # Words the converter could not handle are marked with a trailing '*';
        # fall back to Epitran for those.
        foreign_words = re.findall(r"[^ ]*\*", text_mel_pair[1])
        for word in foreign_words:
            text_mel_pair[1] = text_mel_pair[1].replace(
                word, epi.transliterate(word[:-1]))
def detect_languages(langs, trans):
    with open("./text_logger/languges.json", "r") as read_file:
        lg = json.load(read_file)
    with open("./text_logger/languges_for_transcription.json", "r") as read_file:
        tc = json.load(read_file)
    rs = []
    nd = []
    global epis

    def add_print(langu):
        print_on_magenta(f'---> added {langu} language')

    def add2_print(langu, is_not_sup):
        if is_not_sup:
            print(f'\tlanguage {langu} will be transcribed (with limited support)')
        else:
            print(f'\tlanguage {langu} will be transcribed')

    for lang, need in zip(langs, trans):
        f = False
        if lang in lg.values():
            rs.append(lang)
            f = True
            add_print(lang)
        elif lang in lg.keys():
            rs.append(lg[lang])
            f = True
            add_print(lang)
        else:
            for k in lg.keys():
                if k.startswith(lang):
                    rs.append(lg[k])
                    add_print(k)
                    f = True
                    break
        if not f:
            print_on_red(f"Unknown language: '{lang}'. See the JSON file to correct it")
        else:
            nd.append(need)
            itlang = rs[-1]
            epitran_lang = [key for key, _ in tc.items() if key.startswith(itlang)][0]
            if need:
                epis[itlang] = epitran.Epitran(epitran_lang)
                add2_print(*tc[epitran_lang])
    if len(rs) == 0:
        print_on_red('There are no valid languages in your list. See the JSON file to correct it')
    return rs, nd
def convert_to_ipa(texts):
    print("Converting training files to IPA notation...")
    epi = epitran.Epitran('eng-Latn', ligatures=True)
    for text_mel_pair in texts:
        text_mel_pair[1] = ipa.convert(english_cleaners(text_mel_pair[1]))
        # Fall back to Epitran for words the converter marked with a trailing '*'.
        foreign_words = re.findall(r"[^ ]*\*", text_mel_pair[1])
        for word in foreign_words:
            text_mel_pair[1] = text_mel_pair[1].replace(
                word, epi.transliterate(word[:-1]))
def main(code, op, infiles, output):
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        # logging handles str natively in Python 3; no .encode() needed.
        logging.debug('Scanning:\t%s', fn)
        add_file = add_file_op if op else add_file_gen
        space.update(add_file(epi, ft, fn))
    print_space(output, space)
def generate_from_file(tacotron2_path, waveglow_path, text_file, output_directory):
    # Make synthesis paths
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print("Creating directory " + output_directory + "...")

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    print("Loading models...")
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = model.cuda().eval().half()
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    genlist = []
    with open(text_file) as file:
        for line in file:
            genlist.append(line.strip())

    # Build the transliterator once, not once per entry.
    epi = epitran.Epitran('eng-Latn', ligatures=True)

    for entry in genlist:
        wav_name = "_".join(entry.split(" ")[:4]).lower() + ".wav"
        if hparams.preprocessing == "ipa":
            entry = ipa.convert(english_cleaners(entry))
            foreign_words = re.findall(r"[^ ]*\*", entry)
            for word in foreign_words:
                entry = entry.replace(word, epi.transliterate(word[:-1]))
        if hparams.preprocessing == "arpabet":
            entry = make_arpabet(entry)

        # Text sequencer
        if hparams.preprocessing is not None:
            sequence = np.array(text_to_sequence(entry, None))[None, :]
        else:
            sequence = np.array(text_to_sequence(entry, ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

        # Synthesis
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
            audio_denoised = denoiser(audio, strength=0.01)[:, 0]

        # Save audio
        print("Saving " + wav_name)
        write(os.path.join(output_directory, wav_name),
              hparams.sampling_rate,
              audio_denoised[0].data.cpu().numpy())
def main(unused_argv):
    if not FLAGS.language_id:
        raise ValueError("Specify --language_id!")
    logging.info("Initializing Epitran for \"%s\" ...", FLAGS.language_id)
    epi = epitran.Epitran(FLAGS.language_id)
    logging.info("Processing Bible ...")
    reader = BibleEpitranReader(epi)
    reader.read()
def main(mode, fnin, fnout):
    epi = epitran.Epitran(mode)
    tree = etree.parse(fnin)
    root = tree.getroot()
    with open(fnout, 'w', encoding='utf-8') as fout:
        writer = csv.writer(fout, dialect='excel-tab')
        for entry in root.xpath('//ENTRY'):
            lemma = entry.find('LEMMA').text
            gloss = entry.find('GLOSS').text
            writer.writerow([epi.transliterate(lemma), gloss])
def main(fn):
    epi = epitran.Epitran('ori-Orya')
    with open(fn, encoding='utf-8') as f:
        reader = csv.reader(f, dialect='excel-tab')
        # input_ avoids shadowing the built-in input().
        for lemma, input_, props, gloss in reader:
            props = props.replace(' ', '+')
            gold_analysis = lemma + props
            phonemic_input = epi.transliterate(input_)
            print('\t'.join(
                [input_, phonemic_input, lemma, gold_analysis, gloss]))
def test_quick(self):
    epi = epitran.Epitran('eng-Latn')
    # The original assigned y = u"p͡f" first, but that value was
    # immediately overwritten, so the dead store is dropped.
    y = epi.transliterate(
        "At Müller's execution there was great competition for front seats,"
    )
    # y += " ɡɹât͡ʃi"
    y += "?"
    res = extract_from_sentence(y, ignore_tones=True, ignore_arcs=True)
    print(res)
def init():
    """Init all the phonemisers."""
    languages = [
        p.name
        for p in pathlib.Path("lib/data/phon/").glob("*")
        if not p.name == "README.md"
    ]
    lookup_tables = {}

    # If we have Epitran
    for language in iso_2to3:
        if language.startswith("zh-"):
            lookup_tables[language] = epitran.Epitran(
                iso_2to3[language],
                cedict_file="lib/data/dict/zh",
            )
        else:
            lookup_tables[language] = epitran.Epitran(iso_2to3[language])

    # Otherwise fall back to TSV-style lookup tables
    for language in languages:
        if language == "zh":
            continue
        if language.startswith("ja-"):
            continue
        lines = open("lib/data/phon/" + language).readlines()
        if len(lines) == 0:
            continue
        lookup_tables[language] = {}
        for line in lines:
            if line.strip() == "":
                continue
            kv = line.strip().split("\t")
            if len(kv) != 2:
                print("!", kv, file=sys.stderr)
                continue
            k = kv[0].strip()
            v = kv[1].strip()
            if k not in lookup_tables[language]:
                lookup_tables[language][k] = []
            lookup_tables[language][k].append(v)
    return lookup_tables
def create_epitran_dict():
    """Return a dictionary of languages to Epitran objects."""
    # on_bad_lines replaces the deprecated error_bad_lines (pandas >= 1.3).
    codes = pd.read_csv(SUPPORTED_LANGS_PATH, sep='\t', header=0,
                        on_bad_lines='skip')['Code']
    epitran_dict = {}
    for code in codes:
        if code[:3] in epitran_dict:
            continue
        try:
            epitran_dict[code[:3]] = epitran.Epitran(f'{code}')
        except OSError:
            continue
    return epitran_dict
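A small sketch of consuming the dictionary returned above; note the keys are the first three characters of each code, so a full code like 'fra-Latn' is looked up as 'fra'.

# Hypothetical usage of create_epitran_dict().
epi_by_lang = create_epitran_dict()
if 'fra' in epi_by_lang:
    print(epi_by_lang['fra'].transliterate('bonjour'))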
def read_input(input_, langscript):
    space = set()
    epi = epitran.Epitran(langscript)
    ft = panphon.featuretable.FeatureTable()
    for dirname in input_[0]:
        for fn in glob.glob(os.path.join(dirname, '*.ltf.xml')):
            for token in read_tokens(fn):
                ipa = epi.transliterate(token)
                for seg in ft.segs_safe(ipa):
                    space.add(seg)
    return space
def to_ipa(fname, lang1, lang2):
    epitran_map = {}
    with open(map_file, "r", encoding="utf-8") as f:
        for line in f:
            tks = line.strip().split("\t")
            epitran_map[tks[1]] = tks[0]
    epi1 = epitran.Epitran(epitran_map[lang1])
    epi2 = epitran.Epitran(epitran_map[lang2])
    fsave = fname + ".ipa"
    with open(fname, "r", encoding="utf-8") as f, \
            open(fsave, "w+", encoding="utf-8") as fout:
        for line in f:
            tks = line.strip().split(" ||| ")
            if len(tks) < 3:
                continue
            _tks = [x for x in tks]
            tks[1] = epi1.transliterate(tks[1])
            tks[2] = epi2.transliterate(tks[2])
            # tks = tks[:3] + _tks[2:3] + tks[3:]
            fout.write(" ||| ".join(tks) + "\n")
def transcribe_song_fr(text):
    # The commented-out file-reading variant and the codecs import it
    # relied on are dropped; the function takes the text directly.
    import epitran
    epi = epitran.Epitran('fra-Latn')
    transcribed_song = epi.transliterate(text)
    print(text, transcribed_song)
    return transcribed_song
def _phonemize(text, language):
    try:
        separators = Separator(word=' ', phone='')
        phonemes = phonemize(text, separator=separators,
                             backend='espeak', language=language)
    except RuntimeError:
        # Fall back to Epitran when the espeak backend is unavailable.
        epi = epitran.Epitran(language)
        phonemes = epi.transliterate(text, normpunc=True)
    # str.replace returns a new string; the original discarded the result.
    phonemes = phonemes.replace('\n', ' ', 1)
    return phonemes
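A hedged usage sketch. Note that phonemize's espeak backend and Epitran use different language codes (e.g. 'fr-fr' vs. 'fra-Latn'), so the shared language argument only works if the caller maps between the two conventions.

# Hypothetical call; the code passed must be valid for whichever backend
# ends up handling it (espeak codes and Epitran codes differ).
print(_phonemize('bonjour tout le monde', 'fr-fr'))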
def __init__(self, code, table, decompose=True, cedict_file=None):
    """Construct object for re-romanizing Epitran output.

    This class converts orthographic input, via Epitran, to a more
    conventional romanization that should be more readable to most
    humans.

    Args:
        code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
        table (str): name of the re-romanization table
        decompose (bool): apply decomposing normalization
        cedict_file (str): path to a CC-CEDICT file (needed for Mandarin)
    """
    self.epi = epitran.Epitran(code, cedict_file=cedict_file)
    self.mapping = self._load_reromanizer(table, decompose)
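A usage sketch for the constructor above, assuming it belongs to epitran's ReRomanizer class, whose public entry point is reromanize; the table name used here is hypothetical.

# Hypothetical usage; 'anglocentric' stands in for whatever
# re-romanization table is actually installed.
rr = ReRomanizer('hin-Deva', 'anglocentric')
print(rr.reromanize('नमस्ते'))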
def multip_write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)
    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    csm = gs.models.Word2Vec.load(cg_path)
    epit = epitran.Epitran('swe-Latn')

    csvf = open('{0}_features_overlap_split_020818.csv'.format(lexicon), 'w+', newline='')
    csvw = csv.writer(csvf, delimiter=',')
    T, F = 0, 0

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'
    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    # candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'

    cand_set = []
    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        # print('#', i, 'reading', blend, 'from', candidate_folder + filename)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                if blend in [cw1, cw2]:
                    continue
                sw1, sw2 = gold_blends[blend]
                cand_set.append((blend, cw1, cw2, lexicon, corpus, sw1, sw2,
                                 freqd, csm, wsm, epit))

    for cand_chunk in chunks(cand_set, 10):
        with Pool(3) as p:
            entries = p.starmap(extract_sample_features, cand_chunk)
        print('# writing entries')
        for entry in entries:
            for e in entry:
                csvw.writerow(list(map(lambda x: str(x), e[0].values())))
    csvf.close()
def result():
    db = sqlite3.connect('coarsewords.db')
    wordsearch = request.args.get('word')
    langsearch = request.args.get('lang')
    import epitran
    epi = epitran.Epitran(epitran_langs[langsearch])
    wordipa = ''
    try:
        wordipa = epi.transliterate(wordsearch)
    except KeyError:
        pass
    # Parameterized query: the original concatenated user input into the
    # SQL string, which is an injection risk.
    res = db.execute(
        "SELECT word, categories, phonetics, definition, etymology, langs "
        "FROM coarseword WHERE word=? OR phonetics=?",
        (wordsearch, wordipa))
    words = res.fetchall()
    return render_template('result.html', words=words)
def main(fnin):
    epi = epitran.Epitran('hin-Deva')
    st = mstem.Stemmer('ben-IPA', ['../lexicons/ben.tsv'])
    with open(fnin, encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            for token in line.split(' '):
                ipa = epi.transliterate(token)
                parse = st.gloss(ipa)
                lemma = parse[0]
                morph = '+'.join(parse[1:])
                morph = morph if morph else '<unk>'
                print('w:{}~l:{}~m:{}~ipa:{}'.format(token, lemma, morph, ipa))
            print('')
def to_phoneme_de(self, language='deu-Latn'):
    epi = epitran.Epitran(language)  # lang='deu-Latn' or 'eng-Latn'
    for line in self.data:
        temp = ''
        for word in line['s'].split():
            try:
                phoneme = epi.transliterate(word)
                print(phoneme)
                temp = temp + phoneme + ' '
            except Exception:
                # Keep the original word if transliteration fails; the old
                # `temp += word.join(' ')` only ever appended a space.
                temp += word + ' '
        line['s'] = temp.strip()