def __init__(self, text_cleaners=None, use_phonemes=True, n_jobs=1, with_stress=True):
    """
    Text sequences preprocessor with G2P support.

    :param text_cleaners: list of text cleaner types (defaults to
        ``['basic_cleaners']`` when omitted or ``None``):
        * `basic_cleaners`: basic pipeline that lowercases and collapses whitespace
          without transliteration.
        * `transliteration_cleaners`: pipeline for non-English text that
          transliterates to ASCII.
        * `english_cleaners`: pipeline for English text, including number and
          abbreviation expansion.
    :param use_phonemes: when truthy the symbol table is built from the phoneme
        set (`_PHONEMES`), otherwise from the grapheme set (`_GRAPHEMES`)
    :param n_jobs: number of workers for phonemization (only stored here)
    :param with_stress: set `True` to stress words during phonemization
        (only stored here)
    """
    # Build a fresh list per instance: a list literal in the signature would be
    # shared (and mutable) across every instance that relies on the default.
    self.text_cleaners = ['basic_cleaners'] if text_cleaners is None else text_cleaners
    self.use_phonemes = use_phonemes
    self.n_jobs = n_jobs
    self.with_stress = with_stress

    CHARS = _GRAPHEMES if not self.use_phonemes else _PHONEMES
    self.SYMBOLS = [_PAD, _EOS, _SPACE] + _PUNCTUATIONS + _NUMBERS + CHARS

    # Mappings from symbol to numeric ID and vice versa:
    self._symbol_to_id = {s: i for i, s in enumerate(self.SYMBOLS)}
    self._id_to_symbol = dict(enumerate(self.SYMBOLS))

    self._separator = Separator(word=_WORD_SEP, syllable='', phone=_PHONEME_SEP)
def test_french():
    """Check espeak French output with a custom word separator and spaced phones.

    ``False`` is passed as the third positional argument of ``phonemize``
    (presumably ``strip`` -- confirm against the backend signature).
    """
    separator = Separator(word=';eword ', syllable=None, phone=' ')
    phonemized = EspeakBackend('fr-fr').phonemize(
        ['bonjour le monde'], separator, False)
    assert phonemized == ['b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ']
def __init__(
    self,
    backend,
    word_separator: Optional[str] = None,
    syllable_separator: Optional[str] = None,
    phone_separator: Optional[str] = " ",
    strip=False,
    split_by_single_token: bool = False,
    **phonemizer_kwargs,
):
    """Build a phonemizer backend instance together with its separator.

    Args:
        backend: Key used to look up the backend class in
            ``phonemizer.backend.BACKENDS``.
        word_separator: Passed to ``Separator`` as ``word``.
        syllable_separator: Passed to ``Separator`` as ``syllable``.
        phone_separator: Passed to ``Separator`` as ``phone``.
        strip: Stored on the instance as ``self.strip``.
        split_by_single_token: Stored on the instance as
            ``self.split_by_single_token``.
        **phonemizer_kwargs: Forwarded to the backend constructor.
    """
    # phonemizer is imported lazily, only when an instance is created.
    from phonemizer.backend import BACKENDS
    from phonemizer.separator import Separator

    self.separator = Separator(
        word=word_separator,
        syllable=syllable_separator,
        phone=phone_separator,
    )

    # Raise the "phonemizer" logger threshold to ERROR so that warnings
    # emitted by the backend are suppressed.
    quiet_logger = logging.getLogger("phonemizer")
    quiet_logger.setLevel(logging.ERROR)

    backend_class = BACKENDS[backend]
    self.phonemizer = backend_class(**phonemizer_kwargs, logger=quiet_logger)

    self.strip = strip
    self.split_by_single_token = split_by_single_token
def test_separator_3():
    """Segments backend: '_' between phones, ' ' between words, with and without strip."""
    cree = SegmentsBackend('cree')
    utterances = ['achi acho']
    separator = Separator(word=' ', syllable=None, phone='_')

    unstripped = cree.phonemize(utterances, separator=separator)
    assert unstripped == [u'ʌ_tʃ_ɪ_ ʌ_tʃ_ʊ_ ']

    stripped = cree.phonemize(utterances, separator=separator, strip=True)
    assert stripped == [u'ʌ_tʃ_ɪ ʌ_tʃ_ʊ']
def test_no_switch(policy, caplog):
    """French-only input gives the same output and no log messages for the given policy."""
    sentences = ["j'aime l'anglais", "tu parles le français"]
    espeak = EspeakBackend('fr-fr', language_switch=policy)

    phonemized = espeak.phonemize(sentences, separator=Separator(), strip=True)
    assert phonemized == ['ʒɛm lɑ̃ɡlɛ', 'ty paʁl lə fʁɑ̃sɛ']

    # Third field of each captured record tuple is the message text.
    logged = [record[2] for record in caplog.record_tuples]
    assert logged == []
def test_separator_5():
    """Segments backend: ' ' between phones and '_' between words, with and without strip."""
    cree = SegmentsBackend('cree')
    utterances = ['achi acho']
    separator = Separator(phone=' ', word='_')

    unstripped = cree.phonemize(utterances, separator=separator)
    assert unstripped == [u'ʌ tʃ ɪ _ʌ tʃ ʊ _']

    stripped = cree.phonemize(utterances, separator=separator, strip=True)
    assert stripped == [u'ʌ tʃ ɪ_ʌ tʃ ʊ']
def test_separator_4():
    """Segments backend: ' ' between phones and an empty word separator."""
    cree = SegmentsBackend('cree')
    utterances = ['achi acho']

    # TODO bug when sep.phone == ' ' with no sep.word
    separator = Separator(phone=' ', word='')

    unstripped = cree.phonemize(utterances, separator=separator)
    assert unstripped == [u'ʌ tʃ ɪ ʌ tʃ ʊ ']

    stripped = cree.phonemize(utterances, separator=separator, strip=True)
    assert stripped == [u'ʌ tʃ ɪʌ tʃ ʊ']
def test_tie_simple(caplog, tie, expected):
    """Check espeak output for the given `tie` value and the warning logged when it is truthy."""
    espeak = EspeakBackend('en-us', tie=tie)
    separator = Separator(word=' ', phone='_')

    phonemized = espeak.phonemize(['Jackie Chan'], separator=separator)
    assert phonemized[0] == expected

    if not tie:
        return

    # With a tie requested, the backend must report that the phone
    # separator is ignored.
    logged = [record[2] for record in caplog.record_tuples]
    assert (
        'cannot use ties AND phone separation, ignoring phone separator'
        in logged)
def test_phone_separator_simple():
    """Check '_' as phone separator on an English sentence, stripped then unstripped."""
    sentence = ['The lion and the tiger ran']
    separator = Separator(phone='_')
    espeak = EspeakBackend('en-us')

    # (strip flag, expected output) pairs, checked in the original order.
    cases = (
        (True, ['ð_ə l_aɪə_n æ_n_d ð_ə t_aɪ_ɡ_ɚ ɹ_æ_n']),
        (False, ['ð_ə_ l_aɪə_n_ æ_n_d_ ð_ə_ t_aɪ_ɡ_ɚ_ ɹ_æ_n_ ']),
    )
    for strip, expected in cases:
        output = espeak.phonemize(sentence, separator=separator, strip=strip)
        assert expected == output
def test_arabic():
    """Check espeak Arabic output, which depends on the espeak version."""
    espeak = EspeakBackend('ar')
    separator = Separator()

    # Arabic seems to have changed starting at espeak-ng-1.49.3
    is_recent = EspeakBackend.version() >= (1, 49, 3)
    expected = ['ʔassalaːm ʕliːkm '] if is_recent else ['ʔassalaam ʕaliijkum ']

    phonemized = espeak.phonemize(['السلام عليكم'], separator, False)
    assert phonemized == expected
def _phonemize(text, language):
    """Convert `text` to phonemes, preferring espeak and falling back to epitran.

    The espeak backend is tried first with a space between words and no
    separator between phones. If it raises ``RuntimeError``, the text is
    transliterated with ``epitran.Epitran(language)`` instead.

    :param text: text to phonemize
    :param language: language code handed to espeak, or to epitran on fallback
    :return: the phonemized text with its first newline replaced by a space
    """
    try:
        separators = Separator(word=' ', phone='')
        phonemes = phonemize(text, separator=separators, backend='espeak', language=language)
    except RuntimeError:
        epi = epitran.Epitran(language)
        phonemes = epi.transliterate(text, normpunc=True)
    # str.replace returns a new string; the result must be re-bound, otherwise
    # the call has no effect (strings are immutable).
    phonemes = phonemes.replace('\n', ' ', 1)
    return phonemes
def test_french_sampa(backend):
    """Check SAMPA output of the given backend for French, plus empty-result inputs."""
    sentence = ['bonjour le monde']
    separator = Separator(word=None, phone=' ')

    # (strip flag, expected output) pairs, checked in the original order.
    cases = (
        (False, ['b o~ Z u R l @ m o~ d ']),
        (True, ['b o~ Z u R l @ m o~ d']),
    )
    for strip, expected in cases:
        out = backend.phonemize(sentence, separator=separator, strip=strip)
        assert out == expected

    # An empty string and a lone double quote both phonemize to ''.
    for blank in ([''], ['"']):
        assert backend.phonemize(blank, separator=separator, strip=True) == ['']
def test_language_switch_remove_utterance(caplog, langswitch_text, njobs):
    """Check the 'remove-utterance' policy and that an unknown policy is rejected."""
    espeak = EspeakBackend('fr-fr', language_switch='remove-utterance')
    phonemized = espeak.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
    assert phonemized == ['ʒɛm lɑ̃ɡlɛ', '', '', '', '']

    logged = [record[2] for record in caplog.record_tuples]
    expected_message = (
        'removed 4 utterances containing language switches '
        '(applying "remove-utterance" policy)')
    assert expected_message in logged

    # An unknown policy name must make the constructor raise.
    with pytest.raises(RuntimeError):
        EspeakBackend('fr-fr', language_switch='foo')
def extract_phonemes(filename):
    """Phonemize a text file with festival and write the result beside it.

    Reads the whole of `filename`, phonemizes it as 'en-us' with the
    'festival' backend (phones separated by a space, empty syllable and
    word separators) and prints the result to the path obtained by
    replacing ".txt" with ".phones" in `filename`.
    """
    from phonemizer.phonemize import phonemize
    from phonemizer.backend import FestivalBackend  # imported by the original; unused here
    from phonemizer.separator import Separator

    with open(filename) as source:
        text = source.read()

    separator = Separator(phone=' ', syllable='', word='')
    phones = phonemize(
        text, language='en-us', backend='festival', separator=separator)

    phones_path = filename.replace(".txt", ".phones")
    with open(phones_path, "w") as outfile:
        print(phones, file=outfile)
def __init__(
    self,
    word_separator: Optional[str] = None,
    syllable_separator: Optional[str] = None,
    **phonemize_kwargs,
):
    """Store the ``phonemize`` function, a separator and extra keyword arguments.

    Args:
        word_separator: Passed to ``Separator`` as ``word``.
        syllable_separator: Passed to ``Separator`` as ``syllable``.
        **phonemize_kwargs: Kept on the instance as ``self.phonemize_kwargs``.
    """
    # phonemizer is imported lazily, only when an instance is created.
    from phonemizer import phonemize
    from phonemizer.separator import Separator

    # The phone separator is always a single space in this variant.
    phone_separator = " "
    self.phonemize = phonemize
    self.separator = Separator(
        word=word_separator,
        syllable=syllable_separator,
        phone=phone_separator,
    )
    self.phonemize_kwargs = phonemize_kwargs
def test_language_switch_remove_flags(caplog, langswitch_text, njobs):
    """Check the 'remove-flags' policy: output without flags and both log messages."""
    espeak = EspeakBackend('fr-fr', language_switch='remove-flags')
    phonemized = espeak.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)

    expected = [
        'ʒɛm lɑ̃ɡlɛ',
        'ʒɛm lə fʊtbɔːl',
        'fʊtbɔːl',
        'syʁtu lə ɹiəl madʁid',
        'nytiliz pa ɡuːɡəl',
    ]
    assert phonemized == expected

    logged = [record[2] for record in caplog.record_tuples]
    switch_report = (
        '4 utterances containing language switches on lines 2, 3, 4, 5')
    assert switch_report in logged
    policy_report = (
        'language switch flags have been removed '
        '(applying "remove-flags" policy)')
    assert policy_report in logged
def extract_phonemes(filename):
    """Phonemize a text file with festival and write the result to a relocated path.

    Reads the whole of `filename`, phonemizes it as 'en-us' with the
    'festival' backend (phones separated by a space, empty syllable and
    word separators), rewrites the path prefix '/data06_2/' to
    '/data07/zhoukun/', makes sure the target directory exists and prints
    the phonemes to the ".phones" counterpart of the relocated path.

    NOTE(review): a commented-out line in the original called
    ``FestivalBackend.set_festival_path(...)`` with a local festival binary;
    presumably required when festival is not discoverable -- confirm.
    """
    from phonemizer.phonemize import phonemize
    from phonemizer.separator import Separator

    with open(filename) as f:
        text = f.read()

    phones = phonemize(text, language='en-us', backend='festival',
                       separator=Separator(phone=' ', syllable='', word=''))

    filename = filename.replace('/data06_2/', '/data07/zhoukun/')

    # Dropping the last 13 characters is how the original derives the output
    # directory (presumably the basename plus separator has a fixed length --
    # verify against the dataset layout).
    file_test = filename[:-13]

    # makedirs(exist_ok=True) replaces the isdir()+mkdir() pair: it also
    # creates missing parent directories and does not fail when another
    # process creates the directory between the check and the call.
    os.makedirs(file_test, exist_ok=True)

    with open(filename.replace(".txt", ".phones"), "w") as outfile:
        print(phones, file=outfile)
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
    """Phonemize `text` with the instance backend and return the stripped result.

    Args:
        text: The text to phonemize.
        phonemizer_lang: Optional language; when given and different from
            ``self.phonemizer_lang`` the backend is re-initialised via
            ``self.init_backend`` before phonemizing.

    Returns:
        The first element of the backend output with surrounding whitespace
        removed.
    """
    from phonemizer.separator import Separator

    # The word delimiter token, when set, is followed by a space.
    if self.word_delimiter_token is None:
        word_delimiter = ""
    else:
        word_delimiter = self.word_delimiter_token + " "

    language_changed = (
        phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang
    )
    if language_changed:
        self.init_backend(phonemizer_lang)
    else:
        phonemizer_lang = self.phonemizer_lang

    separator = Separator(
        phone=self.phone_delimiter_token, word=word_delimiter, syllable=""
    )
    outputs = self.backend.phonemize([text], separator=separator)
    return outputs[0].strip()
def test_bad_backend():
    """Unknown backends and unsupported `tie` combinations must raise RuntimeError."""
    rejected_options = (
        {'backend': 'fetiv'},
        {'backend': 'foo'},
        {'tie': True, 'backend': 'festival'},
        {'tie': True, 'backend': 'mbrola'},
        {'tie': True, 'backend': 'segments'},
    )
    for options in rejected_options:
        with pytest.raises(RuntimeError):
            phonemize('', **options)

    # The separator is built inside the `raises` block, as in the original.
    with pytest.raises(RuntimeError):
        phonemize(
            '', tie=True, backend='espeak',
            separator=Separator(' ', None, '-'))
def test_language_switch_default(caplog, langswitch_text, njobs):
    """Check the default language-switch policy: flags are kept and reported."""
    # default behavior is to keep the flags
    espeak = EspeakBackend('fr-fr')
    phonemized = espeak.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)

    expected = [
        'ʒɛm lɑ̃ɡlɛ',
        'ʒɛm lə (en)fʊtbɔːl(fr)',
        '(en)fʊtbɔːl(fr)',
        'syʁtu lə (en)ɹiəl(fr) madʁid',
        'nytiliz pa (en)ɡuːɡəl(fr)',
    ]
    assert phonemized == expected

    logged = [record[2] for record in caplog.record_tuples]
    switch_report = (
        '4 utterances containing language switches on lines 2, 3, 4, 5')
    assert switch_report in logged
    policy_report = (
        'language switch flags have been kept (applying "keep-flags" policy)')
    assert policy_report in logged
def __init__(
    self,
    word_separator: Optional[str] = None,
    syllable_separator: Optional[str] = None,
    phone_separator: Optional[str] = " ",
    split_by_single_token: bool = False,
    **phonemize_kwargs,
):
    """Store the ``phonemize`` function, a separator and extra options.

    Args:
        word_separator: Passed to ``Separator`` as ``word``.
        syllable_separator: Passed to ``Separator`` as ``syllable``.
        phone_separator: Passed to ``Separator`` as ``phone``.
        split_by_single_token: Kept on the instance as
            ``self.split_by_single_token``.
        **phonemize_kwargs: Kept on the instance as ``self.phonemize_kwargs``.
    """
    # phonemizer is imported lazily, only when an instance is created.
    from phonemizer import phonemize
    from phonemizer.separator import Separator

    self.phonemize = phonemize
    separator_options = {
        "word": word_separator,
        "syllable": syllable_separator,
        "phone": phone_separator,
    }
    self.separator = Separator(**separator_options)
    self.split_by_single_token = split_by_single_token
    self.phonemize_kwargs = phonemize_kwargs
def ipa_phonemize(text, lang="en-us", use_g2p=False):
    """Phonemize `text` with either g2p_en or the espeak backend of phonemizer.

    Args:
        text: Text to phonemize.
        lang: Language code passed to espeak; must be "en-us" when
            `use_g2p` is true.
        use_g2p: When true use ``g2p_en.G2p`` and join its output with
            spaces, emitting "|" for each space token. Otherwise call
            ``phonemizer.phonemize`` with word separator "| " and phone
            separator " ".

    Returns:
        The phonemized text as produced by the selected tool.

    Raises:
        AssertionError: If `use_g2p` is true and `lang` is not "en-us".
        ImportError: If the package needed for the selected path is missing;
            the message names the package to install.
    """
    if use_g2p:
        assert lang == "en-us", "g2pE phonemizer only works for en-us"
        # Only the import is guarded, so errors raised while phonemizing are
        # not misreported as a missing package.
        try:
            from g2p_en import G2p
        except ImportError as err:
            # The original message named the wrong package (phonemizer).
            raise ImportError(
                "Please install g2p_en: pip install g2p_en") from err
        g2p = G2p()
        return " ".join("|" if p == " " else p for p in g2p(text))

    try:
        from phonemizer import phonemize
        from phonemizer.separator import Separator
    except ImportError as err:
        raise ImportError(
            "Please install phonemizer: pip install phonemizer") from err
    return phonemize(text, backend='espeak', language=lang,
                     separator=Separator(word="| ", phone=" "))
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
    """Phonemize `text` through ``phonemizer.phonemize`` and strip the result.

    Args:
        text: The text to phonemize.
        phonemizer_lang: Optional language; falls back to
            ``self.phonemizer_lang`` when ``None``.

    Returns:
        The phonemized text with surrounding whitespace removed. Language
        switch flags are removed (``language_switch="remove-flags"``).
    """
    requires_backends(self, "phonemizer")

    from phonemizer import phonemize
    from phonemizer.separator import Separator

    # The word delimiter token, when set, is followed by a space.
    if self.word_delimiter_token is None:
        word_delimiter = ""
    else:
        word_delimiter = self.word_delimiter_token + " "

    if phonemizer_lang is None:
        phonemizer_lang = self.phonemizer_lang

    separator = Separator(
        phone=self.phone_delimiter_token, word=word_delimiter, syllable=""
    )
    raw_phonemes = phonemize(
        text,
        language=phonemizer_lang,
        backend=self.phonemizer_backend,
        separator=separator,
        language_switch="remove-flags",
    )
    return raw_phonemes.strip()
def phonemize(
    cls,
    text: str,
    lang: Optional[str],
    phonemizer: Optional[str] = None,
    preserve_punct: bool = False,
    to_simplified_zh: bool = False,
):
    """Phonemize `text` with the tool selected by `phonemizer`.

    Args:
        text: Input text.
        lang: Language code; required (asserted non-None) for "ipa". "en" and
            "fr" are mapped to "en-us" and "fr-fr", other values pass through.
        phonemizer: "g2p" (g2p_en), "g2pc" (g2pc) or "ipa" (espeak via
            phonemizer). Any other value returns the text unchanged.
        preserve_punct: Only used by "g2p". When true every token is kept and
            space tokens become "|". Otherwise "," and ";" become "sp" and
            only alphanumeric tokens are kept.
        to_simplified_zh: When true the text is first converted with
            ``hanziconv.HanziConv.toSimplified``.

    Returns:
        The phonemized string, or the (possibly converted) text when no known
        phonemizer is selected.
    """
    if to_simplified_zh:
        import hanziconv

        text = hanziconv.HanziConv.toSimplified(text)

    if phonemizer == "g2p":
        import g2p_en

        tokens = g2p_en.G2p()(text)
        if preserve_punct:
            return " ".join("|" if tok == " " else tok for tok in tokens)
        pause_map = {",": "sp", ";": "sp"}
        mapped = [pause_map.get(tok, tok) for tok in tokens]
        return " ".join(tok for tok in mapped if tok.isalnum())

    if phonemizer == "g2pc":
        import g2pc

        analysed = g2pc.G2pC()(text)
        # The element at index 3 of each analysed item is what gets joined.
        return " ".join([item[3] for item in analysed])

    if phonemizer == "ipa":
        assert lang is not None
        # Aliased so the module does not rebind the `phonemizer` parameter.
        import phonemizer as phonemizer_module
        from phonemizer.separator import Separator

        lang_map = {"en": "en-us", "fr": "fr-fr"}
        return phonemizer_module.phonemize(
            text,
            backend="espeak",
            language=lang_map.get(lang, lang),
            separator=Separator(word="| ", phone=" "),
        )

    return text
def test_equal():
    """Separators compare equal by field values; `default_separator` uses word=' ' and empty others."""
    first, second = Separator(), Separator()
    assert first == second

    explicit_default = Separator(phone='', syllable='', word=' ')
    assert default_separator == explicit_default

    word_only = Separator(word=' ')
    assert word_only != default_separator
def test_str():
    """Check the string form of a custom separator and of `default_separator`."""
    custom = Separator(word='w', syllable='s', phone='p')
    rendered_custom = str(custom)
    assert rendered_custom == '(phone: "p", syllable: "s", word: "w")'

    rendered_default = str(default_separator)
    assert rendered_default == '(phone: "", syllable: "", word: " ")'
def test_same():
    """Using the same string for word and phone separators must raise ValueError."""
    shared = {'word': ' ', 'phone': ' '}
    with pytest.raises(ValueError):
        Separator(**shared)
def test_empty(val):
    """A separator built from `val` on all three levels exposes '' for each level."""
    separator = Separator(val, val, val)
    for level in ('phone', 'syllable', 'word'):
        assert getattr(separator, level) == ''
def test_sampa_fr(text, expected):
    """Check the 'mb-fr1' espeak-mbrola voice on `text` with an empty phone separator."""
    mbrola = EspeakMbrolaBackend('mb-fr1')
    phonemized = mbrola.phonemize(
        text, strip=True, separator=Separator(phone=''))
    assert expected == phonemized
def test_im():
    """"I'm" and "Im" must phonemize differently (checked through the `_test` helper)."""
    separator = Separator(word=' ', syllable='', phone='')

    with_apostrophe = _test(["I'm looking for an image"], separator)
    assert with_apostrophe == ['aym luhkaxng faor axn ihmaxjh']

    without_apostrophe = _test(["Im looking for an image"], separator)
    assert without_apostrophe == ['ihm luhkaxng faor axn ihmaxjh']