def rus_to_phonemes(text):
    """Transcribe Russian text to a flat list of phoneme/punctuation tokens.

    The text is transcribed with ``Transcription`` (which splits the input on
    punctuation into segments); after each transcribed segment the punctuation
    mark that followed it in the original text is re-inserted, in order.

    Args:
        text: raw Russian text; '-' is normalized to an em dash and '…' to '.'.

    Returns:
        List of tokens: phoneme symbols interleaved with punctuation marks.
    """
    # Normalize so a hyphen survives as the em-dash punctuation mark below.
    text = text.replace('-', '—')
    text = text.replace('…', '.')
    punctuation_marks = ';:,.!?¡¿—…"«»“”()'
    transcriptor = Transcription()
    # Punctuation tokens in order of appearance; one is consumed after each
    # transcribed segment.  NOTE(review): this assumes Transcription yields
    # exactly one segment per punctuation mark — confirm against its docs.
    # (Replaces the original thread-safe queue.Queue, which is unnecessary
    # overhead in this purely sequential code.)
    pending_marks = iter(' ' + ch + ' ' for ch in text if ch in punctuation_marks)
    pieces = []
    for variant in transcriptor.transcribe([text]):
        for seq in variant:
            pieces.append(' '.join(seq))
            # When marks run out, append nothing (same as the old empty-queue branch).
            pieces.append(next(pending_marks, ''))
    # join + split collapses the padding spaces, exactly like the original
    # string concatenation followed by .split().
    return ''.join(pieces).split()
def __init__(self, path_to_w2v='modelphonemes.model', path_to_annoy='annoy_index.ann', path_to_dict='data.pkl'):
    """Load the word2vec phoneme model, the stress dictionary and the Annoy index.

    Args:
        path_to_w2v: path to the saved gensim Word2Vec model over phonemes.
        path_to_annoy: path to the serialized Annoy index of stress vectors.
        path_to_dict: path to the pickled stress-vector -> sentences mapping.

    Raises:
        IOError: if any of the three files does not exist.
    """
    self.your_transcriptor = Transcription()
    self.your_accentor = Accentor()
    if not os.path.isfile(path_to_w2v):
        raise IOError("File {} does not exist!".format(path_to_w2v))
    self.model = gensim.models.Word2Vec.load(path_to_w2v)
    if not os.path.isfile(path_to_dict):
        raise IOError("File {} does not exist!".format(path_to_dict))
    with open(path_to_dict, 'rb') as dict_file:
        self.dict_of_acc = pickle.load(dict_file)
    self.accents = list(self.dict_of_acc.keys())
    # Every stress vector has the same length; take the dimensionality from the first key.
    dimension = len(self.accents[0])
    self.t = AnnoyIndex(dimension, 'hamming')
    if not os.path.isfile(path_to_annoy):
        raise IOError("File {} does not exist!".format(path_to_annoy))
    self.t.load(path_to_annoy)
class RussianG2P:
    """Grapheme-to-phoneme converter for batches of grapheme sequences."""

    def __init__(self, word_separator="\t"):
        """Create the converter.

        Args:
            word_separator: token that delimits words in the grapheme stream.
        """
        self.transcriptor = Transcription()
        self.word_separator = word_separator

    def _split_words(self, graphemes):
        """Split a grapheme sequence on the separator, keeping alphabetic words only."""
        words = []
        current = ""
        # A trailing separator is appended so the last word is flushed too.
        for grapheme in graphemes + [self.word_separator]:
            if grapheme != self.word_separator:
                current += grapheme
                continue
            if current.isalpha():
                words.append(current)
            current = ""
        return words

    def __call__(self, graphemes_batch, return_word_to_phonemes_dictionary=False):
        """Transcribe every grapheme sequence in the batch.

        Args:
            graphemes_batch: iterable of grapheme lists, words separated by
                ``word_separator`` tokens.
            return_word_to_phonemes_dictionary: when True, also return the
                word -> phonemes mapping accumulated over the whole batch.

        Returns:
            The list of phoneme sequences, optionally with the mapping.
        """
        processed_batch = []
        word_to_phonemes_dictionary = {}
        for graphemes in graphemes_batch:
            words = self._split_words(graphemes)
            phoneme_sequence = []
            for word, variants in zip(words, self.transcriptor.transcribe(words)):
                if not variants:
                    continue
                # Some words have several pronunciation variants; keep the first.
                chosen = variants[0]
                phoneme_sequence.extend(chosen)
                phoneme_sequence.append(self.word_separator)
                if return_word_to_phonemes_dictionary:
                    word_to_phonemes_dictionary[word] = chosen
            processed_batch.append(phoneme_sequence)
        if return_word_to_phonemes_dictionary:
            return processed_batch, word_to_phonemes_dictionary
        return processed_batch
class TestAll(unittest.TestCase):
    """Unit tests for Transcription.transcribe on whole phrases."""

    def setUp(self):
        self.__transcription = Transcription()

    def tearDown(self):
        del self.__transcription

    def test_normal(self):
        """A plain phrase produces a single pronunciation variant."""
        phrase = 'Мама мыла раму'
        expected = [[
            'M', 'A0', 'M', 'A', 'M', 'Y0', 'L', 'A', 'R', 'A0', 'M', 'U'
        ]]
        actual = self.__transcription.transcribe([phrase])[0]
        self.assertEqual(expected, actual)

    def test_symbols(self):
        """Punctuation splits the phrase into separately transcribed segments."""
        phrase = 'Мама мыла ра-му, а ты?! - Нет.'
        expected = [[
            'M', 'A0', 'M', 'A', 'M', 'Y0', 'L', 'A', 'R', 'A0', 'M', 'U0'
        ], ['A', 'T', 'Y0'], ['N0', 'E0', 'T']]
        actual = self.__transcription.transcribe([phrase])[0]
        self.assertEqual(expected, actual)

    def test_nothing(self):
        """Punctuation-only input yields no transcription at all."""
        actual = self.__transcription.transcribe(['...'])[0]
        self.assertEqual([], actual)

    def test_begin(self):
        """A leading dash is ignored."""
        actual = self.__transcription.transcribe(['- Ага'])[0]
        self.assertEqual([['A', 'G', 'A0']], actual)

    def test_accented(self):
        """Explicit '+' stress marks must not change the transcription."""
        unmarked = self.__transcription.transcribe(['диалог был'])[0]
        marked = self.__transcription.transcribe(['диало+г бы+л'])[0]
        self.assertEqual(unmarked, marked)
def __init__(self, word_separator="\t"):
    """Create a grapheme-to-phoneme converter.

    Args:
        word_separator: token that delimits words in the grapheme stream.
    """
    self.word_separator = word_separator
    self.transcriptor = Transcription()
def setUp(self):
    # Fresh Transcription engine before every test case.
    # NOTE(review): `__transcription` is name-mangled against the enclosing
    # TestCase class, which is outside this view.
    self.__transcription = Transcription()
class PhoneticIndex(object):
    """Index of sentences searchable by stress pattern and phonetic similarity.

    Combines a word2vec model over phonemes, an Annoy index over binary
    stress vectors and a pickled mapping from stress vectors to sentences.
    """

    def __init__(self, path_to_w2v='modelphonemes.model', path_to_annoy='annoy_index.ann', path_to_dict='data.pkl'):
        """Load the phoneme embeddings, the stress dictionary and the Annoy index.

        Raises:
            IOError: if any of the three model files is missing.
        """
        self.your_transcriptor = Transcription()
        self.your_accentor = Accentor()
        if os.path.isfile(path_to_w2v):
            self.model = gensim.models.Word2Vec.load(path_to_w2v)
        else:
            raise IOError("File {} does not exist!".format(path_to_w2v))
        if os.path.isfile(path_to_dict):
            with open(path_to_dict, 'rb') as f:
                self.dict_of_acc = pickle.load(f)
        else:
            raise IOError("File {} does not exist!".format(path_to_dict))
        self.accents = list(self.dict_of_acc.keys())
        # All stress vectors share one length; use the first as the Annoy dimensionality.
        vector_length = len(self.accents[0])
        self.t = AnnoyIndex(vector_length, 'hamming')
        if os.path.isfile(path_to_annoy):
            self.t.load(path_to_annoy)
        else:
            raise IOError("File {} does not exist!".format(path_to_annoy))

    def transform(self, sentence, acc_number=10, sent_number=1):
        """Return up to ``sent_number`` sentences rhythmically/phonetically close to ``sentence``."""
        assert acc_number >= sent_number, "number of variants for nearest neighbors should be bigger than number of nearest sentences"
        phonemes = self.get_phonemes(sentence)
        accents = self.get_accents(sentence)
        closest_vectors = self.get_closest_vecs(accents, number=acc_number)
        closest_sentences = self.get_embeddings(closest_vectors,
                                                phonemes,
                                                number=sent_number)
        return closest_sentences

    def get_phonemes(self, sentence):
        """Return a (100, 100) embedding matrix for the phonemes of ``sentence``.

        The transcription is flattened to a phoneme list, each phoneme is
        embedded with the word2vec model, and the matrix is zero-padded on
        the left to exactly 100 rows.  An empty transcription yields zeros.
        """
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            new_sentence = self.transcriptor(sentence)
        text = []
        for string in new_sentence[0]:
            for phoneme in string:
                text.append(phoneme)
        if len(text) != 0:
            try:
                # Batch lookup of all phonemes at once.
                # NOTE(review): gensim >= 4 removed Word2Vec.__getitem__;
                # there this should read self.model.wv[text] — confirm the
                # pinned gensim version before changing.
                phoneme_sent = self.model[text]
            except Exception:  # FIX: narrowed from a bare except (kept Ctrl-C working)
                # At least one phoneme is out of vocabulary: embed one by one,
                # substituting a 100-dim zero vector for unknown symbols.
                phoneme_sent = []
                for word in text:
                    try:
                        phoneme_word = self.model[word]
                    except Exception:  # FIX: narrowed from a bare except
                        print("unknown word", word)
                        phoneme_word = np.zeros(100)
                    phoneme_sent.append(phoneme_word)
                phoneme_sent = np.array(phoneme_sent)
            if len(phoneme_sent) < 100:
                # Left-pad with zero rows to the fixed height of 100.
                difference = 100 - len(phoneme_sent)
                part = np.zeros((difference, 100))
                phoneme_sent = np.concatenate((part, phoneme_sent))
            assert len(
                phoneme_sent
            ) == 100, "len of vector is inappropriate: {}".format(sentence)
        else:
            phoneme_sent = np.zeros((100, 100))
        return phoneme_sent

    def get_accents(self, sentence):
        """Return the binary stress vector of ``sentence`` as a 29-tuple of 0/1."""
        vector = []
        sentence = sentence.translate(
            sentence.maketrans(
                '', '', '!&?\./(){}[]"$%^*+=@№<>|–—_€£±•`≠…§~«»₽,:;')).lower()
        for word in sentence.split():
            # Insert the stress mark; unknown words come back unstressed.
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    accents = self.accentor(word)
            except Exception:  # FIX: narrowed from a bare except
                accents = [[word]]
            s = accents[0][0]
            vowels = "эоуиаеёюыяЭОУАЕИЁЫЮЯ"
            for letter, next_letter in zip(s, s[1:] + " "):
                # A vowel followed by '+' is stressed (1), otherwise unstressed (0).
                if letter in vowels:
                    vector.append(1 if next_letter == "+" else 0)
        if len(vector) < 29:
            # Left-pad to the fixed length of 29.
            difference = 29 - len(vector)
            part = [0 for n in range(difference)]
            vector = part + vector
        assert len(vector) == 29, "len of vector is inappropriate: {}".format(
            sentence)
        return tuple(vector)

    def get_closest_vecs(self, vector, number=10):
        """Return the ``number`` stress vectors nearest to ``vector`` in the Annoy index."""
        closest = [
            self.t.get_item_vector(x)
            for x in self.t.get_nns_by_vector(vector, number)
        ]
        # Annoy stores floats; the dictionary keys are tuples of ints.
        closest_int = [[int(x) for x in vec] for vec in closest]
        return closest_int

    def get_embeddings(self, vectors, source_embedding, number=1):
        """Return the ``number`` sentences phonetically closest to ``source_embedding``.

        Candidates are gathered from the stress dictionary for every vector in
        ``vectors`` and ranked by nearest-neighbour search over their
        flattened phoneme embeddings.
        """
        possible_sentences = []
        for vector in vectors:
            possible_sentences += self.dict_of_acc[tuple(vector)]
        possible_embs = []
        embs_sentences = {}
        for sentence in possible_sentences:
            emb_sentence = self.get_phonemes(sentence)
            full_emb = np.concatenate(tuple(emb_sentence))
            possible_embs.append(full_emb)
            # Group sentences that share an embedding (replaces the old
            # if/else whose two branches both appended).
            embs_sentences.setdefault(tuple(full_emb), []).append(sentence)
        assert len(
            possible_embs
        ) >= number, "Number of nearest neighbors should be less than number of possible neighbors"
        source_embedding = np.concatenate(tuple(source_embedding))
        final_sentences = []
        # FIX: scikit-learn >= 1.0 requires n_neighbors by keyword; the old
        # positional call NearestNeighbors(number) raises TypeError there.
        neigh = NearestNeighbors(n_neighbors=number)
        neigh.fit(possible_embs)
        nearest_neighbors = neigh.kneighbors([source_embedding],
                                             return_distance=False).tolist()
        for element in nearest_neighbors[0]:
            for sentence in embs_sentences[tuple(possible_embs[element])]:
                final_sentences.append(sentence.replace('\xa0', ' '))
        return final_sentences

    @functools.lru_cache(maxsize=None)
    def accentor(self, word):
        """Cached wrapper around Accentor.do_accents for a single word.

        NOTE(review): lru_cache on an instance method keeps ``self`` alive and
        grows without bound; tolerable only if this index is a long-lived
        singleton — confirm before reusing elsewhere.
        """
        return self.your_accentor.do_accents([[word]])

    @functools.lru_cache(maxsize=None)
    def transcriptor(self, sentence):
        """Cached wrapper around Transcription.transcribe for one sentence."""
        return self.your_transcriptor.transcribe([sentence])
def main():
    """CLI entry point: transcribe phrases and write text/pronunciation TSV pairs."""
    parser = ArgumentParser()
    parser.add_argument(
        '-s', '--src', dest='source_data_file', type=str, required=True,
        help='Source file with phrases list to the g2p transforming.')
    parser.add_argument(
        '-d', '--dst', dest='destination_data_file', type=str, required=True,
        help=
        'Destination file into which texts and created phonetical pronunciations, corresponding '
        'to these texts, will be written.')
    parser.add_argument(
        '-o', '--order', dest='pair_order', type=str, required=False,
        choices=['text-pronunciation', 'pronunciation-text'],
        default='pronunciation-text',
        help=
        'Order of each pair: text and its pronunciation or pronunciation and corresponding text?'
    )
    args = parser.parse_args()

    src_name = os.path.normpath(args.source_data_file)
    assert os.path.isfile(src_name), 'File "{0}" does not exist!'.format(
        src_name)
    dst_name = os.path.normpath(args.destination_data_file)
    dst_dir = os.path.dirname(dst_name)
    if len(dst_dir) > 0:
        assert os.path.isdir(
            dst_dir), 'Directory "{0}" does not exist!'.format(dst_dir)

    transcriptor = Transcription(raise_exceptions=True, verbose=False,
                                 batch_size=256, use_wiki=False)
    silence = '<sil>'

    def wrap_with_silence(pronunciation):
        # Flatten the per-part phoneme lists and bracket them with silence
        # markers; an empty pronunciation stays empty.
        if len(pronunciation) == 0:
            return []
        wrapped = [silence]
        for cur_part in pronunciation:
            wrapped += cur_part
        wrapped.append(silence)
        return wrapped

    def write_pair(dst_fp, text, transcription):
        # Emit one TSV line in the order requested on the command line.
        if args.pair_order == 'text-pronunciation':
            dst_fp.write('{0}\t{1}\n'.format(
                text.lower(), ' '.join(transcription)))
        else:
            dst_fp.write('{0}\t{1}\n'.format(
                ' '.join(transcription), text.lower()))

    with codecs.open(dst_name, mode='w', encoding='utf-8',
                     errors='ignore') as dst_fp:
        for source_lines_v1, source_lines_v2 in iterate_by_texts(src_name):
            pronunciations_v1 = transcriptor.transcribe(source_lines_v1)
            pronunciations_v2 = transcriptor.transcribe(source_lines_v2)
            for line_idx in range(len(source_lines_v1)):
                transcription_v1 = wrap_with_silence(pronunciations_v1[line_idx])
                transcription_v2 = wrap_with_silence(pronunciations_v2[line_idx])
                source_text = source_lines_v2[line_idx]
                if (len(transcription_v1) > 0) and (len(transcription_v2) > 0):
                    write_pair(dst_fp, source_text, transcription_v1)
                    # Write the second variant only when it actually differs.
                    if transcription_v2 != transcription_v1:
                        write_pair(dst_fp, source_text, transcription_v2)
            print('{0} texts have been processed...'.format(
                len(source_lines_v1)))
            del pronunciations_v1
            del pronunciations_v2
            del source_lines_v1
            del source_lines_v2
# NOTE(review): this chunk begins mid-function — the enclosing `def` (which
# binds `text` and initializes `_text`) is outside this view; indentation of
# the fragment below is reconstructed, confirm against the full file.
for i, char in enumerate(text):
    # '2' appears to act as a doubling marker: repeat the previous character.
    if char == '2':
        try:
            _text.extend([_text[-1]])
        except:  # NOTE(review): bare except; fires when '2' is the first char — consider IndexError
            print(''.join(text))
    else:
        _text.extend([char])
text = ''.join(_text)
return text

# --- script section: build a text -> phoneme table for the listed manifests ---
manifests = ['../data/manifests/val_v05_cleaned_asr.csv', '../data/manifests/taptaxi_izhevsk_checked_14h.csv']
df = pd.concat([read_manifest(_) for _ in manifests])
text_paths = list(df.text_path.values)
# Phoneme transcriptions live next to each text file, with a _phoneme suffix.
phoneme_paths = [_.replace('.txt','_phoneme.txt') for _ in text_paths]
data = zip(text_paths, phoneme_paths)
data = list(data)
your_transcriptor = Transcription()
# process_text_file presumably reads each pair and returns (text, phoneme) — defined outside this view.
text_tuples = [process_text_file(tup) for tup in tqdm(data)]
proc_df = pd.DataFrame(text_tuples, columns=['text', 'phoneme'])
proc_df.to_feather('text_to_phoneme.feather')
# --- imports and CLI setup for the phoneme-dictionary builder script ---
import pandas as pd
from tqdm import tqdm
from pandas.core.common import flatten
from russian_g2p.Transcription import Transcription

# NOTE(review): `argparse` and `re` are used below but their imports are outside this view.
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, help='pd.Dataframe with train meta location')
parser.add_argument('-o', '--output', type=str, help='Where to store dict')
args = parser.parse_args()

# Matches runs of Cyrillic letters (А-я plus ё/Ё).
word_regexp = re.compile(r'[А-яЁё]+')
transcriptor = Transcription()

if __name__ == '__main__':
    # Training metadata is a pipe-separated file: path|sentence|speaker.
    data = pd.read_csv(args.input, header=None, sep='|', names=['path', 'sentence', 'speaker'])
    sentences = data['sentence'].values
    phonemes_dict = {}  # word -> phoneme tokens, filled lazily per word below
    for sent in tqdm(sentences):
        words_matches = list(word_regexp.finditer(sent))
        for i_word_match, word_match in enumerate(words_matches):
            matched_word = word_match.group(0)
            matched_word_tokens = phonemes_dict.get(matched_word, None)
            # NOTE(review): the body of this branch continues past the end of this view.
            if matched_word_tokens is None: