def test_preallocate_smaller(self):
    """
    Check the number of visible samples around ``preallocate_memory()``.

    The test asserts that preallocating on an empty file leaves
    zero samples, that adding five samples yields a length of five,
    and that preallocating a smaller size (two) afterwards
    leaves exactly two samples.
    """
    audio = AudioFile()

    # preallocating space must not create any sample
    audio.preallocate_memory(100)
    self.assertEqual(len(audio.audio_samples), 0)

    # five samples in, five samples visible
    five_samples = numpy.array([1, 2, 3, 4, 5])
    audio.add_samples(five_samples)
    self.assertEqual(len(audio.audio_samples), 5)

    # preallocating less than the current length shrinks the data
    audio.preallocate_memory(2)
    self.assertEqual(len(audio.audio_samples), 2)
def test_add_samples_reverse_memory(self):
    """
    Check ``add_samples(..., reverse=True)`` on two consecutive chunks.

    The test asserts that each chunk is stored reversed,
    while the chunks themselves keep their insertion order:
    ``[1..5]`` then ``[6..10]`` is expected to read ``5, 4, ..., 1, 10, 9, ..., 6``.
    """
    audio = AudioFile()
    for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
        audio.add_samples(numpy.array(chunk), reverse=True)

    self.assertEqual(len(audio.audio_samples), 10)

    # (index, expected sample) pairs, probing both ends of each chunk
    expected_samples = (
        (0, 5),
        (1, 4),
        (4, 1),
        (5, 10),
        (6, 9),
        (9, 6),
    )
    for index, expected in expected_samples:
        self.assertEqual(audio.audio_samples[index], expected)
def test_add_samples_memory(self):
    """
    Check ``add_samples()`` on two consecutive chunks.

    The test asserts that after adding ``[1..5]`` and ``[6..10]``
    the file holds ten samples, in insertion order.
    """
    audio = AudioFile()
    for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
        audio.add_samples(numpy.array(chunk))

    self.assertEqual(len(audio.audio_samples), 10)

    # (index, expected sample) pairs, probing both ends of each chunk
    expected_samples = (
        (0, 1),
        (1, 2),
        (4, 5),
        (5, 6),
        (6, 7),
        (9, 10),
    )
    for index, expected in expected_samples:
        self.assertEqual(audio.audio_samples[index], expected)
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize all the fragments of ``text_file`` via Python call,
    and write the concatenated audio to ``output_file_path``.

    Each fragment is synthesized with ``self._synthesize_single_helper()``
    and its samples are appended to a single one-channel ``pcm16``
    ``AudioFile`` whose sample rate is ``self.SAMPLE_RATE``.

    On success, return ``(True, (anchors, total_time, num_chars))``,
    where each anchor is ``[start_time, fragment.identifier, fragment.text]``.

    :param text_file: object exposing the ``fragments`` to be synthesized
    :param output_file_path: the path of the audio file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as
                       the accumulated duration exceeds this value
    :param bool backwards: if ``True``, synthesize the fragments
                           from the last one to the first one
    :rtype: (bool, (list, TimeValue, int))
    """
    self.log(u"Calling TTS engine via Python...")
    try:
        # container collecting the samples of all the fragments
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = "pcm16"
        output_file.audio_channels = 1
        output_file.audio_sample_rate = self.SAMPLE_RATE

        # accumulators for the returned values
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0

        fragments = text_file.fragments[::-1] if backwards else text_file.fragments
        for num, fragment in enumerate(fragments):
            # map the fragment language to the voice code of the TTS engine
            voice_code = self._language_to_voice_code(fragment.language)

            # synthesize the fragment; sample rate and encoding are not used here
            self.log([u"Synthesizing fragment %d", num])
            duration, _sample_rate, _encoding, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )

            # the anchor records the time at which this fragment begins
            anchors.append([current_time, fragment.identifier, fragment.text])
            num_chars += fragment.characters

            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # when going backwards, each chunk is stored reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])

            # stop as soon as enough audio has been produced
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break

        # when going backwards, reverse the whole audio once more
        if backwards:
            output_file.reverse()

        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        # NOTE(review): log_exc() receives type(exc) as its last argument;
        # whether it re-raises is not visible from here -- confirm
        self.log_exc(u"Unexpected exception while calling TTS engine via Python", exc, None, type(exc))
        return (False, None)

    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_python(
        self,
        text_file,
        output_file_path,
        quit_after=None,
        backwards=False
):
    """
    Synthesize multiple text fragments, via Python call,
    concatenating the resulting audio into ``output_file_path``.

    The output is a one-channel ``pcm16`` ``AudioFile``
    with sample rate ``self.SAMPLE_RATE``; every fragment is produced
    by ``self._synthesize_single_helper()``.

    The returned payload is ``(anchors, total_time, num_chars)``,
    each anchor being ``[start_time, fragment.identifier, fragment.text]``.

    :param text_file: object exposing the ``fragments`` to be synthesized
    :param output_file_path: the path of the audio file to be written
    :param quit_after: if not ``None``, stop synthesizing once
                       the accumulated duration exceeds this value
    :param bool backwards: if ``True``, process the fragments in reverse order
    :rtype: (bool, (list, TimeValue, int))
    """
    log = self.log
    log(u"Calling TTS engine via Python...")
    try:
        # set up the output container
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = "pcm16"
        output_file.audio_channels = 1
        output_file.audio_sample_rate = self.SAMPLE_RATE

        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0

        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]

        for num, fragment in enumerate(fragments):
            voice_code = self._language_to_voice_code(fragment.language)

            log([u"Synthesizing fragment %d", num])
            # only the duration and the samples are needed here
            duration, _rate, _codec, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )

            # remember where this fragment starts and how long its text is
            anchors.append([current_time, fragment.identifier, fragment.text])
            num_chars += fragment.characters

            log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # chunks are stored reversed when synthesizing backwards
                output_file.add_samples(data, reverse=backwards)
            else:
                log([u"Fragment %d has zero duration", num])

            # enough audio? then do not synthesize the remaining fragments
            if (quit_after is not None) and (current_time > quit_after):
                log([u"Quitting after reached duration %.3f", current_time])
                break

        # undo the per-chunk reversal on the whole audio
        if backwards:
            output_file.reverse()

        log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        # NOTE(review): type(exc) is handed to log_exc();
        # whether it re-raises is not visible from here -- confirm
        self.log_exc(
            u"Unexpected exception while calling TTS engine via Python",
            exc,
            None,
            type(exc)
        )
        return (False, None)
    else:
        # NOTE anchors do not make sense if backwards
        log([u"Returning %d time anchors", len(anchors)])
        log([u"Current time %.3f", current_time])
        log([u"Synthesized %d characters", num_chars])
        log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_generic(self, helper_function, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple fragments, generic function.

    The ``helper_function`` is a function that takes parameters
    ``(text, voice_code, output_file_path)``
    and returns a tuple
    ``(result, (audio_length, audio_sample_rate, audio_format, audio_samples))``.

    Codec and sample rate are read from ``self.OUTPUT_AUDIO_FORMAT``
    when it is a 3-element value; otherwise they are obtained
    by synthesizing a dummy text with ``helper_function``.
    Fragments are then synthesized through ``self._loop_use_cache``
    or ``self._loop_no_cache``, depending on ``self.use_cache``.

    :param helper_function: the function synthesizing a single text
    :param text_file: object exposing the ``fragments`` to be synthesized
    :param output_file_path: the path of the audio file to be written
    :param quit_after: if not ``None``, stop synthesizing once
                       the accumulated duration exceeds this value
    :param bool backwards: if ``True``, process the fragments in reverse order
    :rtype: tuple (result, (anchors, current_time, num_chars))
    """
    self.log(u"Calling TTS engine using multiple generic function...")

    # get sample rate and codec
    self.log(u"Determining codec and sample rate...")
    format_is_declared = (
        (self.OUTPUT_AUDIO_FORMAT is not None) and
        (len(self.OUTPUT_AUDIO_FORMAT) == 3)
    )
    if format_is_declared:
        # the number of channels in the declared format is not used
        self.log(u"Reading codec and sample rate from OUTPUT_AUDIO_FORMAT")
        codec, _channels, sample_rate = self.OUTPUT_AUDIO_FORMAT
    else:
        # probe the engine: only codec and sample rate of the result are kept
        self.log(u"Determining codec and sample rate with dummy text...")
        succeeded, data = helper_function(
            text=u"Dummy text to get sample_rate",
            voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE),
            output_file_path=None
        )
        if not succeeded:
            self.log_crit(u"An unexpected error occurred in helper_function")
            return (False, None)
        _duration, sample_rate, codec, _samples = data
        self.log(u"Determining codec and sample rate with dummy text... done")
    self.log(u"Determining codec and sample rate... done")
    self.log([u" codec: %s", codec])
    self.log([u" sample rate: %d", sample_rate])

    # open output file
    output_file = AudioFile(rconf=self.rconf, logger=self.logger)
    output_file.audio_format = codec
    output_file.audio_channels = 1
    output_file.audio_sample_rate = sample_rate

    # accumulators for the returned values
    anchors = []
    current_time = TimeValue("0.000")
    num_chars = 0

    fragments = text_file.fragments
    if backwards:
        fragments = fragments[::-1]

    # choose once how each fragment is going to be synthesized
    if self.use_cache:
        loop_function = self._loop_use_cache
    else:
        loop_function = self._loop_no_cache

    for num, fragment in enumerate(fragments):
        succeeded, data = loop_function(
            helper_function=helper_function,
            num=num,
            fragment=fragment
        )
        if not succeeded:
            self.log_crit(u"An unexpected error occurred in loop_function")
            return (False, None)
        duration, _rate, _codec, samples = data

        # the anchor records the time at which this fragment begins
        anchors.append([current_time, fragment.identifier, fragment.text])
        num_chars += fragment.characters

        # concatenate new samples
        self.log([u"Fragment %d starts at: %.3f", num, current_time])
        if duration > 0:
            self.log([u"Fragment %d duration: %.3f", num, duration])
            current_time += duration
            output_file.add_samples(samples, reverse=backwards)
        else:
            self.log([u"Fragment %d has zero duration", num])

        # stop as soon as enough audio has been produced
        if (quit_after is not None) and (current_time > quit_after):
            self.log([u"Quitting after reached duration %.3f", current_time])
            break

    # minimize memory
    self.log(u"Minimizing memory...")
    output_file.minimize_memory()
    self.log(u"Minimizing memory... done")

    # chunks were appended reversed: reverse the whole audio once more
    if backwards:
        self.log(u"Reversing audio samples...")
        output_file.reverse()
        self.log(u"Reversing audio samples... done")

    # write output file
    self.log([u"Writing audio file '%s'", output_file_path])
    output_file.write(file_path=output_file_path)

    # return output
    if backwards:
        self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine using multiple generic function... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_subprocess(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple fragments via ``subprocess``.

    Each fragment is synthesized into a temporary ``.wav`` file
    by ``self._synthesize_single_subprocess()``; the resulting samples
    are concatenated into a single one-channel ``AudioFile``,
    which is finally written to ``output_file_path``.
    Sample rate and encoding of the output are obtained
    by synthesizing a dummy text first.

    On success, return ``(True, (anchors, current_time, num_chars))``,
    where each anchor is ``[start_time, fragment.identifier, fragment.text]``.
    If an exception is raised while synthesizing, it is logged
    via ``self.log_exc()`` and ``(False, None)`` is returned.

    :param text_file: object exposing the ``fragments`` to be synthesized
    :param output_file_path: the path of the audio file to be written
    :param quit_after: if not ``None``, stop synthesizing once
                       the accumulated duration exceeds this value
    :param bool backwards: if ``True``, process the fragments in reverse order
    :rtype: tuple (result, (anchors, current_time, num_chars))
    """
    def synthesize_and_clean(text, voice_code):
        """
        Synthesize a single fragment via subprocess,
        and immediately remove the temporary file.

        The temporary file is removed even if the synthesis raises,
        so that failed calls do not leave files behind.

        :rtype: tuple (duration, sample_rate, encoding, samples)
        """
        self.log(u"Synthesizing text...")
        handler, tmp_destination = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        try:
            # NOTE(review): the success flag is not inspected here;
            # presumably a failed synthesis yields data that the caller
            # cannot unpack, ending up in its except handler -- confirm
            _succeeded, data = self._synthesize_single_subprocess(
                text=(text + u" "),
                voice_code=voice_code,
                output_file_path=tmp_destination
            )
        finally:
            # always release the temporary file, also on errors
            self.log([u"Removing temporary file '%s'", tmp_destination])
            gf.delete_file(handler, tmp_destination)
        self.log(u"Synthesizing text... done")
        return data

    self.log(u"Calling TTS engine via subprocess...")

    try:
        # get sample rate and encoding from a dummy synthesis;
        # its duration and samples are discarded
        _duration, sample_rate, encoding, _samples = synthesize_and_clean(
            text=u"Dummy text to get sample_rate",
            voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE)
        )

        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate

        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for num, fragment in enumerate(fragments):
            # language to voice code
            voice_code = self._language_to_voice_code(fragment.language)

            # synthesize and get the duration of the output file;
            # sample rate and encoding of the single fragment are not used
            self.log([u"Synthesizing fragment %d", num])
            duration, _rate, _encoding, samples = synthesize_and_clean(
                text=fragment.filtered_text,
                voice_code=voice_code
            )

            # store for later output: the anchor records
            # the time at which this fragment begins
            anchors.append([current_time, fragment.identifier, fragment.text])

            # increase the character counter
            num_chars += fragment.characters

            # concatenate new samples
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # when going backwards, each chunk is stored reversed
                output_file.add_samples(samples, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])

            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break

        # minimize memory
        self.log(u"Minimizing memory...")
        output_file.minimize_memory()
        self.log(u"Minimizing memory... done")

        # if backwards, we need to reverse the audio samples again
        if backwards:
            self.log(u"Reversing audio samples...")
            output_file.reverse()
            self.log(u"Reversing audio samples... done")

        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
        return (False, None)

    # return output
    if backwards:
        self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via subprocess... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call,
    and write the concatenated audio to ``output_file_path``.

    Sample rate and encoding of the output are obtained
    by synthesizing a dummy text first; then each fragment
    is synthesized with ``self._synthesize_single_helper()``
    and its samples are appended to a single one-channel ``AudioFile``.

    On success, return ``(True, (anchors, total_time, num_chars))``,
    where each anchor is ``[start_time, fragment.identifier, fragment.text]``.
    If an exception is raised, it is logged via ``self.log_exc()``
    and ``(False, None)`` is returned.

    :param text_file: object exposing the ``fragments`` to be synthesized
    :param output_file_path: the path of the audio file to be written
    :param quit_after: if not ``None``, stop synthesizing once
                       the accumulated duration exceeds this value
    :param bool backwards: if ``True``, process the fragments in reverse order
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # TODO the Speect Python API does not seem to offer a way
    #      to generate the wave incrementally,
    #      hence this mirrors the subprocess mechanism:
    #      wave data is generated for each fragment,
    #      and the chunks are concatenated together
    #
    self.log(u"Calling TTS engine via Python...")
    try:
        # probe the engine for sample rate and encoding;
        # duration and samples of the dummy text are discarded
        _duration, sample_rate, encoding, _samples = self._synthesize_single_helper(
            text=u"Dummy text to get sample_rate",
            voice_code=self.DEFAULT_LANGUAGE
        )

        # container collecting the samples of all the fragments
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate

        # accumulators for the returned values
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0

        fragments = text_file.fragments[::-1] if backwards else text_file.fragments
        for num, fragment in enumerate(fragments):
            #
            # NOTE voice_code is actually ignored
            #      by _synthesize_single_helper(),
            #      so its value is irrelevant here;
            #      in general, however, the text language must be mapped
            #      to a voice code with _language_to_voice_code():
            #      the one defined in super() is applied,
            #      which sets voice_code = fragment.language
            #
            voice_code = self._language_to_voice_code(fragment.language)

            # synthesize the fragment; sample rate and encoding are not used here
            self.log([u"Synthesizing fragment %d", num])
            duration, _rate, _encoding, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )

            # the anchor records the time at which this fragment begins
            anchors.append([current_time, fragment.identifier, fragment.text])
            num_chars += fragment.characters

            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # when going backwards, each chunk is stored reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])

            # stop as soon as enough audio has been produced
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break

        # when going backwards, reverse the whole audio once more
        if backwards:
            output_file.reverse()

        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via Python", exc, False, None)
        return (False, None)

    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_python(
        self,
        text_file,
        output_file_path,
        quit_after=None,
        backwards=False
):
    """
    Synthesize multiple text fragments, via Python call.

    A dummy text is synthesized first, to learn sample rate
    and encoding of the engine output; then every fragment is
    synthesized with ``self._synthesize_single_helper()`` and
    its samples are appended to a one-channel ``AudioFile``,
    which is finally written to ``output_file_path``.

    The returned payload is ``(anchors, total_time, num_chars)``,
    each anchor being ``[start_time, fragment.identifier, fragment.text]``.
    If an exception is raised, it is logged via ``self.log_exc()``
    and ``(False, None)`` is returned.

    :param text_file: object exposing the ``fragments`` to be synthesized
    :param output_file_path: the path of the audio file to be written
    :param quit_after: if not ``None``, stop synthesizing once
                       the accumulated duration exceeds this value
    :param bool backwards: if ``True``, process the fragments in reverse order
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # TODO no way was found in the Speect Python API
    #      to generate the wave incrementally,
    #      so the subprocess call mechanism is essentially copied:
    #      wave data is generated for each fragment,
    #      and the chunks are concatenated together
    #
    log = self.log
    log(u"Calling TTS engine via Python...")
    try:
        # get sample rate and encoding; the rest of the dummy output is unused
        _length, sample_rate, encoding, _wave = self._synthesize_single_helper(
            text=u"Dummy text to get sample_rate",
            voice_code=self.DEFAULT_LANGUAGE
        )

        # set up the output container
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate

        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0

        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]

        for num, fragment in enumerate(fragments):
            #
            # NOTE _synthesize_single_helper() actually ignores voice_code,
            #      hence the value computed here is irrelevant;
            #      in general, though, _language_to_voice_code() must be
            #      applied to map the text language to a voice code:
            #      the implementation defined in super() is used,
            #      which sets voice_code = fragment.language
            #
            voice_code = self._language_to_voice_code(fragment.language)

            log([u"Synthesizing fragment %d", num])
            # only the duration and the samples are needed here
            duration, _rate, _codec, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )

            # remember where this fragment starts and how long its text is
            anchors.append([current_time, fragment.identifier, fragment.text])
            num_chars += fragment.characters

            log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # chunks are stored reversed when synthesizing backwards
                output_file.add_samples(data, reverse=backwards)
            else:
                log([u"Fragment %d has zero duration", num])

            # enough audio? then do not synthesize the remaining fragments
            if (quit_after is not None) and (current_time > quit_after):
                log([u"Quitting after reached duration %.3f", current_time])
                break

        # undo the per-chunk reversal on the whole audio
        if backwards:
            output_file.reverse()

        log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(
            u"An unexpected error occurred while calling TTS engine via Python",
            exc,
            False,
            None
        )
        return (False, None)
    else:
        # NOTE anchors do not make sense if backwards
        log([u"Returning %d time anchors", len(anchors)])
        log([u"Current time %.3f", current_time])
        log([u"Synthesized %d characters", num_chars])
        log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))