Example #1
from queue import Queue

from russian_g2p.Transcription import Transcription


def rus_to_phonemes(text):
    # Convert a Russian text into a flat list of phoneme labels,
    # re-inserting the original punctuation marks between transcribed phrases.
    text = text.replace('-', '—')
    text = text.replace('…', '.')
    punctuation_marks = ';:,.!?¡¿—…"«»“”()'
    transcriptor = Transcription()
    phonemes = ''
    queue = Queue()
    for i in text:
        if i in punctuation_marks:
            queue.put(' ' + i + ' ')
    for it in transcriptor.transcribe([text]):
        for seq in it:
            if queue.empty():
                phonemes += ' '.join(seq)
                continue
            phonemes += ' '.join(seq) + queue.get()
    return phonemes.split()
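
A minimal usage sketch for the function above (the call below and the comment on its output are illustrative assumptions, not part of the original listing):

tokens = rus_to_phonemes('Мама мыла раму.')
print(tokens)  # expected: a flat list of phoneme labels with the '.' re-inserted after the phrase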
Example #2
    def __init__(self,
                 path_to_w2v='modelphonemes.model',
                 path_to_annoy='annoy_index.ann',
                 path_to_dict='data.pkl'):
        self.your_transcriptor = Transcription()
        self.your_accentor = Accentor()
        if os.path.isfile(path_to_w2v):
            self.model = gensim.models.Word2Vec.load(path_to_w2v)
        else:
            raise IOError("File {} does not exist!".format(path_to_w2v))
        if os.path.isfile(path_to_dict):
            with open(path_to_dict, 'rb') as f:
                self.dict_of_acc = pickle.load(f)
        else:
            raise IOError("File {} does not exist!".format(path_to_dict))
        self.accents = list(self.dict_of_acc.keys())

        f = len(self.accents[0])
        self.t = AnnoyIndex(f, 'hamming')
        if os.path.isfile(path_to_annoy):
            self.t.load(path_to_annoy)
        else:
            raise IOError("File {} does not exist!".format(path_to_annoy))
Example #3
class RussianG2P:
    def __init__(self, word_separator="\t"):
        self.transcriptor = Transcription()
        self.word_separator = word_separator

    def __call__(self,
                 graphemes_batch,
                 return_word_to_phonemes_dictionary=False):
        processed_batch = []
        word_to_phonemes_dictionary = {}

        for graphemes in graphemes_batch:
            words = []
            phonemes = []
            word = ""

            for grapheme in graphemes + [self.word_separator]:
                if grapheme == self.word_separator:
                    if word.isalpha():
                        words.append(word)
                    word = ""
                else:
                    word += grapheme

            g2p_output = self.transcriptor.transcribe(words)
            for word, word_phonemes in zip(words, g2p_output):

                if not word_phonemes:
                    continue

                # Some words have several pronunciation variants;
                # we select the first one.
                word_phonemes = word_phonemes[0]
                phonemes += word_phonemes + [self.word_separator]

                if return_word_to_phonemes_dictionary:
                    word_to_phonemes_dictionary[word] = word_phonemes

            processed_batch.append(phonemes)

        if return_word_to_phonemes_dictionary:
            return processed_batch, word_to_phonemes_dictionary

        return processed_batch
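
A minimal usage sketch for RussianG2P (an assumption, not part of the original listing: each batch element is a list of single graphemes with the word separator character placed between words, which is how __call__ above splits words):

g2p = RussianG2P(word_separator="\t")
batch = [list("мама\tмыла\tраму")]
phonemes_batch, word_dict = g2p(batch, return_word_to_phonemes_dictionary=True)
print(phonemes_batch[0])  # phonemes of the three words, joined with "\t" separators
print(word_dict)          # per-word phoneme lists, keyed by 'мама', 'мыла', 'раму'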
Example #4
class TestAll(unittest.TestCase):
    def setUp(self):
        self.__transcription = Transcription()

    def tearDown(self):
        del self.__transcription

    def test_normal(self):
        source_phrase = 'Мама мыла раму'
        target_variants = [[
            'M', 'A0', 'M', 'A', 'M', 'Y0', 'L', 'A', 'R', 'A0', 'M', 'U'
        ]]
        real_variants = self.__transcription.transcribe([source_phrase])[0]
        self.assertEqual(target_variants, real_variants)

    def test_symbols(self):
        source_phrase = 'Мама мыла ра-му, а ты?! - Нет.'
        target_variants = [[
            'M', 'A0', 'M', 'A', 'M', 'Y0', 'L', 'A', 'R', 'A0', 'M', 'U0'
        ], ['A', 'T', 'Y0'], ['N0', 'E0', 'T']]
        real_variants = self.__transcription.transcribe([source_phrase])[0]
        self.assertEqual(target_variants, real_variants)

    def test_nothing(self):
        source_phrase = '...'
        target_variants = []
        real_variants = self.__transcription.transcribe([source_phrase])[0]
        self.assertEqual(target_variants, real_variants)

    def test_begin(self):
        source_phrase = '- Ага'
        target_variants = [['A', 'G', 'A0']]
        real_variants = self.__transcription.transcribe([source_phrase])[0]
        self.assertEqual(target_variants, real_variants)

    def test_accented(self):
        source_phrase_1 = 'диалог был'
        real_variants_1 = self.__transcription.transcribe([source_phrase_1])[0]
        source_phrase_2 = 'диало+г бы+л'
        real_variants_2 = self.__transcription.transcribe([source_phrase_2])[0]
        self.assertEqual(real_variants_1, real_variants_2)
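
If the test module above is run directly, a standard unittest entry point (not shown in the original excerpt) would execute it:

if __name__ == '__main__':
    unittest.main()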
Example #5
 def __init__(self, word_separator="\t"):
     self.transcriptor = Transcription()
     self.word_separator = word_separator
Example #6
 def setUp(self):
     self.__transcription = Transcription()
Example #7
import functools
import os
import pickle
import warnings

import gensim
import numpy as np
from annoy import AnnoyIndex
from sklearn.neighbors import NearestNeighbors

from russian_g2p.Accentor import Accentor
from russian_g2p.Transcription import Transcription


class PhoneticIndex(object):
    def __init__(self,
                 path_to_w2v='modelphonemes.model',
                 path_to_annoy='annoy_index.ann',
                 path_to_dict='data.pkl'):
        self.your_transcriptor = Transcription()
        self.your_accentor = Accentor()
        if os.path.isfile(path_to_w2v):
            self.model = gensim.models.Word2Vec.load(path_to_w2v)
        else:
            raise IOError("File {} does not exist!".format(path_to_w2v))
        if os.path.isfile(path_to_dict):
            with open(path_to_dict, 'rb') as f:
                self.dict_of_acc = pickle.load(f)
        else:
            raise IOError("File {} does not exist!".format(path_to_dict))
        self.accents = list(self.dict_of_acc.keys())

        f = len(self.accents[0])
        self.t = AnnoyIndex(f, 'hamming')
        if os.path.isfile(path_to_annoy):
            self.t.load(path_to_annoy)
        else:
            raise IOError("File {} does not exist!".format(path_to_annoy))

    def transform(self, sentence, acc_number=10, sent_number=1):
        assert acc_number >= sent_number, \
            "the number of accent-pattern neighbors must be at least the number of nearest sentences"

        phonemes = self.get_phonemes(sentence)

        accents = self.get_accents(sentence)

        closest_vectors = self.get_closest_vecs(accents, number=acc_number)

        closest_sentences = self.get_embeddings(closest_vectors,
                                                phonemes,
                                                number=sent_number)

        return closest_sentences

    def get_phonemes(self, sentence):
        # Transcribe the sentence and return a fixed-size matrix of phoneme embeddings.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            new_sentence = self.transcriptor(sentence)

        text = []

        for string in new_sentence[0]:
            for phoneme in string:
                text.append(phoneme)

        if len(text) != 0:
            try:
                # Build the embeddings for the whole phoneme sequence in one call.
                phoneme_sent = self.model[text]

            except:
                # If a symbol is missing from the embedding vocabulary, build the embeddings
                # one by one, replacing unknown symbols with 100-dimensional zero vectors.
                phoneme_sent = []
                for word in text:
                    try:
                        phoneme_word = self.model[word]
                    except:
                        print("unknown word", word)
                        phoneme_word = np.zeros(100)
                    phoneme_sent.append(phoneme_word)
                phoneme_sent = np.array(phoneme_sent)

            if len(phoneme_sent) < 100:
                # Zero-pad the sequence at the front up to the fixed length of 100.
                difference = 100 - len(phoneme_sent)
                part = np.zeros((difference, 100))
                phoneme_sent = np.concatenate((part, phoneme_sent))

            assert len(phoneme_sent) == 100, \
                "len of vector is inappropriate: {}".format(sentence)
        else:
            phoneme_sent = np.zeros((100, 100))

        return phoneme_sent

    def get_accents(self, sentence):
        # Return a binary vector of stresses (1 = stressed vowel, 0 = unstressed) for the sentence.
        vector = []
        sentence = sentence.translate(
            sentence.maketrans(
                '', '', '!&?\./(){}[]"$%^*+=@№<>|–—_€£±•`≠…§~«»₽,:;')).lower()
        for word in sentence.split():
            # Put the stress mark in the word; if the word is unknown, it is kept unstressed.
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    accents = self.accentor(word)

            except:
                #print("unknown accent word: ", word)
                accents = [[word]]

            s = accents[0][0]
            vowels = "эоуиаеёюыяЭОУАЕИЁЫЮЯ"
            for letter, next_letter in zip(s, s[1:] + " "):
                # Encode the vowels as a binary vector: stressed vowel -> 1, unstressed -> 0.
                if letter in vowels:
                    if next_letter == "+":
                        vector.append(1)
                    else:
                        vector.append(0)

        if len(vector) < 29:
            # Zero-pad the vector at the front up to the standard length of 29.
            difference = 29 - len(vector)
            part = [0 for n in range(difference)]
            vector = part + vector

        assert len(vector) == 29, "len of vector is inappropriate: {}".format(
            sentence)
        return tuple(vector)

    def get_closest_vecs(self, vector, number=10):
        # Return the `number` nearest accent vectors from the Annoy index.
        closest = [
            self.t.get_item_vector(x)
            for x in self.t.get_nns_by_vector(vector, number)
        ]

        closest_int = [[int(x) for x in vector] for vector in closest]

        return closest_int

    def get_embeddings(self, vectors, source_embedding, number=1):
        # Return the `number` sentences whose phoneme embeddings are closest to the source one.

        possible_sentences = []
        for vector in vectors:
            possible_sentences += self.dict_of_acc[tuple(vector)]
        possible_embs = []
        embs_sentences = {}
        for sentence in possible_sentences:
            emb_sentence = self.get_phonemes(sentence)
            full_emb = np.concatenate(tuple(emb_sentence))
            possible_embs.append(full_emb)
            full_emb = tuple(full_emb)
            if full_emb not in embs_sentences:
                embs_sentences[full_emb] = list()
                embs_sentences[full_emb].append(sentence)
            else:
                embs_sentences[full_emb].append(sentence)

        assert len(possible_embs) >= number, \
            "the number of nearest neighbors must not exceed the number of available candidates"
        source_embedding = np.concatenate(tuple(source_embedding))
        final_sentences = []
        neigh = NearestNeighbors(n_neighbors=number)
        neigh.fit(possible_embs)
        nearest_neighbors = neigh.kneighbors([source_embedding],
                                             return_distance=False).tolist()
        for element in nearest_neighbors[0]:
            for sentence in embs_sentences[tuple(possible_embs[element])]:
                final_sentences.append(sentence.replace('\xa0', ' '))
        return final_sentences

    @functools.lru_cache(maxsize=None)
    def accentor(self, word):
        return self.your_accentor.do_accents([[word]])

    @functools.lru_cache(maxsize=None)
    def transcriptor(self, sentence):
        return self.your_transcriptor.transcribe([sentence])
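
A minimal usage sketch for PhoneticIndex, assuming the three artifact files named in the default constructor arguments already exist on disk (the sentence and parameter values below are illustrative):

index = PhoneticIndex(path_to_w2v='modelphonemes.model',
                      path_to_annoy='annoy_index.ann',
                      path_to_dict='data.pkl')
# Search among 10 accent-pattern neighbors and return the single phonetically closest sentence.
print(index.transform('Мама мыла раму', acc_number=10, sent_number=1))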
Example #8
import codecs
import os
from argparse import ArgumentParser

from russian_g2p.Transcription import Transcription


def main():
    parser = ArgumentParser()
    parser.add_argument(
        '-s',
        '--src',
        dest='source_data_file',
        type=str,
        required=True,
        help='Source file with phrases list to the g2p transforming.')
    parser.add_argument(
        '-d',
        '--dst',
        dest='destination_data_file',
        type=str,
        required=True,
        help='Destination file into which the texts and their generated phonetic '
        'transcriptions will be written.')
    parser.add_argument(
        '-o',
        '--order',
        dest='pair_order',
        type=str,
        required=False,
        choices=['text-pronunciation', 'pronunciation-text'],
        default='pronunciation-text',
        help='Order of items in each pair: the text followed by its pronunciation, '
        'or the pronunciation followed by the corresponding text.')
    args = parser.parse_args()

    src_name = os.path.normpath(args.source_data_file)
    assert os.path.isfile(src_name), 'File "{0}" does not exist!'.format(
        src_name)

    dst_name = os.path.normpath(args.destination_data_file)
    dst_dir = os.path.dirname(dst_name)
    if len(dst_dir) > 0:
        assert os.path.isdir(
            dst_dir), 'Directory "{0}" does not exist!'.format(dst_dir)

    transcriptor = Transcription(raise_exceptions=True,
                                 verbose=False,
                                 batch_size=256,
                                 use_wiki=False)
    silence = '<sil>'
    with codecs.open(dst_name, mode='w', encoding='utf-8',
                     errors='ignore') as dst_fp:
        for source_lines_v1, source_lines_v2 in iterate_by_texts(src_name):
            pronunciations_v1 = transcriptor.transcribe(source_lines_v1)
            pronunciations_v2 = transcriptor.transcribe(source_lines_v2)
            for line_idx in range(len(source_lines_v1)):
                pronunciation_v1 = pronunciations_v1[line_idx]
                if len(pronunciation_v1) > 0:
                    transcription_v1 = [silence]
                    for cur_part in pronunciation_v1:
                        transcription_v1 += cur_part
                        transcription_v1.append(silence)
                else:
                    transcription_v1 = []
                pronunciation_v2 = pronunciations_v2[line_idx]
                if len(pronunciation_v2) > 0:
                    transcription_v2 = [silence]
                    for cur_part in pronunciation_v2:
                        transcription_v2 += cur_part
                        transcription_v2.append(silence)
                else:
                    transcription_v2 = []
                source_text = source_lines_v2[line_idx]
                if (len(transcription_v1) > 0) and (len(transcription_v2) > 0):
                    if args.pair_order == 'text-pronunciation':
                        dst_fp.write('{0}\t{1}\n'.format(
                            source_text.lower(), ' '.join(transcription_v1)))
                    else:
                        dst_fp.write('{0}\t{1}\n'.format(
                            ' '.join(transcription_v1), source_text.lower()))
                    if transcription_v2 != transcription_v1:
                        if args.pair_order == 'text-pronunciation':
                            dst_fp.write('{0}\t{1}\n'.format(
                                source_text.lower(),
                                ' '.join(transcription_v2)))
                        else:
                            dst_fp.write('{0}\t{1}\n'.format(
                                ' '.join(transcription_v2),
                                source_text.lower()))
            print('{0} texts have been processed...'.format(
                len(source_lines_v1)))
            del pronunciations_v1
            del pronunciations_v2
            del source_lines_v1
            del source_lines_v2
Example #9
        for i, char in enumerate(text):
            if char == '2':
                try:
                    # '2' duplicates the previously emitted character.
                    _text.append(_text[-1])
                except IndexError:
                    # A leading '2' has nothing to duplicate; report the problematic text.
                    print(''.join(text))
            else:
                _text.append(char)
        text = ''.join(_text)
    return text


manifests = ['../data/manifests/val_v05_cleaned_asr.csv',
             '../data/manifests/taptaxi_izhevsk_checked_14h.csv']

df = pd.concat([read_manifest(_) for _ in manifests])

text_paths = list(df.text_path.values)
phoneme_paths = [_.replace('.txt','_phoneme.txt') for _ in text_paths]

data = zip(text_paths,
           phoneme_paths)

data = list(data)

your_transcriptor = Transcription()

text_tuples = [process_text_file(tup) for tup in tqdm(data)]

proc_df = pd.DataFrame(text_tuples, columns=['text', 'phoneme'])
proc_df.to_feather('text_to_phoneme.feather')
Example #10
import argparse
import re

import pandas as pd
from tqdm import tqdm
from pandas.core.common import flatten

from russian_g2p.Transcription import Transcription

parser = argparse.ArgumentParser()
parser.add_argument('-i',
                    '--input',
                    type=str,
                    help='pd.Dataframe with train meta location')
parser.add_argument('-o', '--output', type=str, help='Where to store dict')
args = parser.parse_args()

word_regexp = re.compile(r'[А-яЁё]+')
transcriptor = Transcription()

if __name__ == '__main__':
    data = pd.read_csv(args.input,
                       header=None,
                       sep='|',
                       names=['path', 'sentence', 'speaker'])
    sentences = data['sentence'].values

    phonemes_dict = {}
    for sent in tqdm(sentences):
        words_matches = list(word_regexp.finditer(sent))
        for i_word_match, word_match in enumerate(words_matches):
            matched_word = word_match.group(0)
            matched_word_tokens = phonemes_dict.get(matched_word, None)
            if matched_word_tokens is None:
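                # Hypothetical continuation (the original snippet is cut off here):
                # transcribe the new word and cache its first pronunciation variant.
                variants = transcriptor.transcribe([matched_word])
                if variants and variants[0]:
                    phonemes_dict[matched_word] = variants[0][0]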