Пример #1
0
def test_preserve_2(text, expected):
    """Punctuation survives a preserve/restore round trip and espeak

    `text` is the input to process, `expected` is the output expected from
    `phonemize()` when run with the espeak backend and punctuation
    preservation enabled on a custom set of marks.

    """
    punctuation_marks = ".!;:,?"
    punctuator = Punctuation(marks=punctuation_marks)

    # restoring what preserve() extracted must give back the original input
    stripped, extracted = punctuator.preserve(text)
    assert text == punctuator.restore(stripped, extracted)

    # the same marks must be kept by the high-level phonemize() function
    options = {
        "backend": "espeak",
        "preserve_punctuation": True,
        "punctuation_marks": punctuation_marks,
    }
    phonemized = phonemize(text, **options)
    assert phonemized == expected
Пример #2
0
def test_preserve_2(text, output):
    """Check custom punctuation marks are preserved end to end

    First ensures `Punctuation.restore()` undoes `Punctuation.preserve()` on
    `text`, then ensures `phonemize()` with the espeak backend returns
    `output` when asked to preserve the same punctuation marks.

    """
    punctuation_marks = ".!;:,?"
    punctuator = Punctuation(marks=punctuation_marks)

    # preserve() gives the (text, marks) pair expected by restore()
    assert text == punctuator.restore(*punctuator.preserve(text))

    phonemized = phonemize(text,
                           backend="espeak",
                           preserve_punctuation=True,
                           punctuation_marks=punctuation_marks)
    assert phonemized == output
Пример #3
0
class BaseBackend(abc.ABC):
    """Abstract base class of all the phonemization backends

    Provides a common interface to all backends. The central method is
    `phonemize()`

    Parameters
    ----------
    language (str): The language code of the input text, must be supported by
      the backend. If `backend` is 'segments', the language can be a file with
      a grapheme to phoneme mapping.

    preserve_punctuation (bool): When True, will keep the punctuation in the
      phonemized output. Not supported by the 'espeak-mbrola' backend. Default
      to False and remove all the punctuation.

    punctuation_marks (str): The punctuation marks to consider when dealing
      with punctuation, either for removal or preservation. Default to
      Punctuation.default_marks().

    logger (logging.Logger): the logging instance where to send
      messages. If not specified, use the default system logger.

    Raises
    ------
    RuntimeError if the backend is not available or if the `language` cannot be
    initialized.

    """
    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError(  # pragma: nocover
                '{} not installed on your system'.format(self.name()))

        self._logger = logger
        self._logger.info('initializing backend %s-%s', self.name(),
                          '.'.join(str(v) for v in self.version()))

        # ensure the backend support the requested language
        self._language = self._init_language(language)

        # setup punctuation processing
        self._preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)

    @classmethod
    def _init_language(cls, language):
        """Language initialization

        Returns `language` unchanged when it is supported by the backend,
        raises a RuntimeError otherwise.

        This method may be overloaded in child classes (see Segments backend)

        """
        if not cls.is_supported_language(language):
            raise RuntimeError(
                f'language "{language}" is not supported by the '
                f'{cls.name()} backend')
        return language

    @property
    def logger(self):
        """A logging.Logger instance where to send messages"""
        return self._logger

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""

    @classmethod
    def is_supported_language(cls, language):
        """Returns True if `language` is supported by the backend"""
        return language in cls.supported_languages()

    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language

        Parameters
        ----------
        text (list of str): The text to be phonemized. Each string in the list
          is considered as a separated line. Each line is considered as a text
          utterance. Any empty utterance will be ignored.

        separator (Separator): string separators between phonemes, syllables
          and words, default to separator.default_separator. Syllable separator
          is considered only for the festival backend. Word separator is
          ignored by the 'espeak-mbrola' backend.

        strip (bool): If True, don't output the last word and phone separators
          of a token, default to False.

        njobs (int): The number of parallel jobs to launch. The input text is
          split in `njobs` parts, phonemized on parallel instances of the
          backend and the outputs are finally collapsed.

        Returns
        -------
        phonemized text (list of str) : The input `text` phonemized for the
          given `language` and `backend`.

        Raises
        ------
        RuntimeError if something went wrong during the phonemization

        """
        if isinstance(text, str):
            # changed in phonemizer-3.0, warn the user. This only logs an
            # error, the processing goes on with the text as provided.
            self.logger.error(
                'input text to phonemize() is str but it must be list')

        text, punctuation_marks = self._phonemize_preprocess(text)

        if njobs == 1:
            # single job: phonemize the whole text at once, the offset of
            # its first line is then 0
            phonemized = self._phonemize_aux(text, 0, separator, strip)
        else:
            self.logger.info('running %s on %s jobs', self.name(), njobs)

            # split the text with chunks() and phonemize each chunk in its
            # own job: we have here a list of phonemized chunks
            phonemized = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(
                    # chunk[0] is the text, chunk[1] is the offset
                    chunk[0],
                    chunk[1],
                    separator,
                    strip) for chunk in zip(*chunks(text, njobs)))

            # flatten them in a single list
            phonemized = self._flatten(phonemized)

        return self._phonemize_postprocess(phonemized, punctuation_marks)

    @staticmethod
    def _flatten(phonemized):
        """Flatten a list of lists into a single one

        From [[1, 2], [3], [4]] returns [1, 2, 3, 4]. This method is used to
        format the output as obtained using multiple jobs.

        """
        # from_iterable() consumes the outer list directly instead of
        # unpacking it into positional arguments
        return list(itertools.chain.from_iterable(phonemized))

    @abc.abstractmethod
    def _phonemize_aux(self, text, offset, separator, strip):
        """The "concrete" phonemization method

        Must be implemented in child classes. `separator` and `strip`
        parameters are as given to the phonemize() method. `text` is as
        returned by _phonemize_preprocess(). `offset` is line number of the
        first line in `text` with respect to the original text (this is only
        useful when running on chunks in multiple jobs. When using a single
        job the offset is 0).

        """

    def _phonemize_preprocess(self, text):
        """Preprocess the text before phonemization

        Removes the punctuation (keep trace of punctuation marks for further
        restoration if required by the `preserve_punctuation` option).

        Returns a tuple (text, punctuation marks). The punctuation marks are
        an empty list when the punctuation is not preserved.

        """
        if self._preserve_punctuation:
            # a tuple (text, punctuation marks)
            return self._punctuator.preserve(text)
        return self._punctuator.remove(text), []

    def _phonemize_postprocess(self, phonemized, punctuation_marks):
        """Postprocess the raw phonemized output

        Restores the punctuation as needed: `punctuation_marks` are restored
        in `phonemized` only if the `preserve_punctuation` option is True,
        otherwise `phonemized` is returned unchanged.

        """
        if self._preserve_punctuation:
            return self._punctuator.restore(phonemized, punctuation_marks)
        return phonemized
Пример #4
0
def test_preserve(inp):
    """Restoring preserved punctuation gives back the input `inp`

    Uses a `Punctuation` instance built with its default arguments.

    """
    punctuator = Punctuation()

    # preserve() gives the (text, marks) pair expected by restore()
    assert inp == punctuator.restore(*punctuator.preserve(inp))
Пример #5
0
class BaseBackend(object):
    """Abstract base class of all the phonemization backends

    Provides a common interface to all backends. The central method is
    `phonemize()`

    Parameters
    ----------
    language (str): The language code of the input text, must be supported by
      the backend (as told by `is_supported_language()`).

    punctuation_marks: The punctuation marks given to the `Punctuation`
      instance in charge of punctuation removal or preservation. Default to
      Punctuation.default_marks().

    preserve_punctuation (bool): When True, the punctuation is restored in
      the phonemized output. Default to False and remove the punctuation.

    logger: The logging instance where to send messages, default to the one
      returned by get_logger().

    Raises
    ------
    RuntimeError if the backend is not available or if the `language` is not
    supported by the backend.

    """
    # NOTE(review): the `__metaclass__` attribute is only honored by Python
    # 2. Python 3 ignores it, so the abstract methods below are not enforced
    # there. Consider `six.add_metaclass(abc.ABCMeta)` once all the child
    # classes are known to implement every abstract method.
    __metaclass__ = abc.ABCMeta

    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError(  # pragma: nocover
                '{} not installed on your system'.format(self.name()))

        self.logger = logger
        self.logger.info('initializing backend %s-%s', self.name(),
                         self.version())

        # ensure the backend support the requested language
        if not self.is_supported_language(language):
            raise RuntimeError(
                'language "{}" is not supported by the {} backend'.format(
                    language, self.name()))
        self.language = language

        # setup punctuation processing
        self.preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""
        pass

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""
        pass

    @staticmethod
    @abc.abstractmethod
    def version():
        """Return the backend version as a string 'major.minor.patch'"""
        pass

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""
        pass

    @classmethod
    @abc.abstractmethod
    def is_supported_language(cls, language):
        """Returns True if `language` is supported by the backend

        Declared abstract but provides a default implementation, looking for
        `language` in `supported_languages()`.

        """
        return language in cls.supported_languages()

    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language

        Parameters
        ----------
        text (str or list of str): The text to be phonemized.

        separator: The separator forwarded to the backend implementation,
          default to default_separator.

        strip (bool): Forwarded to the backend implementation, default to
          False.

        njobs (int): The number of parallel jobs to launch. When different
          from 1 the text is split with `chunks(text, njobs)`, each chunk is
          phonemized in its own job and the outputs are finally collapsed.

        Returns
        -------
        The phonemized text, formatted with `list2str()` if the input `text`
        is a string and with `str2list()` otherwise.

        """
        # remember if the input is a string, to format the output the same
        # way. Must be an isinstance() check done before `text` is rebound:
        # comparing type(text) with the members of six.string_types fails on
        # Python 2 (where it contains only basestring) and on str subclasses.
        is_string = isinstance(text, six.string_types)

        # deals with punctuation: remove it and keep track of it for
        # restoration at the end if asked for
        punctuation_marks = []
        if self.preserve_punctuation:
            text, punctuation_marks = self._punctuator.preserve(text)
        else:
            text = self._punctuator.remove(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)
            log_storage = self.logger
            self.logger = None

            try:
                # we have here a list of phonemized chunks
                text = joblib.Parallel(n_jobs=njobs)(
                    joblib.delayed(self._phonemize_aux)(t, separator, strip)
                    for t in chunks(text, njobs))
            finally:
                # restore the log as it was before parallel processing, even
                # if a job failed: the backend must not be left without
                # logger
                self.logger = log_storage

            # flatten them in a single list
            text = list(itertools.chain.from_iterable(text))

        # restore the punctuation if asked for
        if self.preserve_punctuation:
            text = self._punctuator.restore(text, punctuation_marks)

        # output the result formatted as a string or a list of strings
        # according to the type of the input text
        return list2str(text) if is_string else str2list(text)

    @abc.abstractmethod
    def _phonemize_aux(self, text, separator, strip):
        """The backend specific phonemization, implemented in child classes

        Called by `phonemize()` with `separator` and `strip` as received. The
        `text` is the whole text as a string when running a single job, or a
        chunk as returned by `chunks()` when running multiple jobs. In that
        last case the outputs of all the jobs are chained together, so the
        returned value must be iterable.

        """
        pass
Пример #6
0
def test_preserve(inp):
    """Check `restore()` is the inverse of `preserve()` on the input `inp`

    Uses a `Punctuation` instance built with its default arguments.

    """
    punctuator = Punctuation()

    # split the input into the text and the punctuation marks...
    stripped, extracted = punctuator.preserve(inp)

    # ... and merge them back: this must be the original input
    restored = punctuator.restore(stripped, extracted)
    assert inp == restored