def _synthesize(self):
    """
    Synthesize the text of the current task into a ``wav`` file.

    Any exception raised while creating the temporary file or while
    synthesizing is logged and reported through the success flag;
    it is not propagated to the caller.

    Return a quadruple:

    1. a success bool flag
    2. handler of the generated wave file (``None`` if not created)
    3. path of the generated wave file (``None`` if not created)
    4. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]`` (``None`` on failure)
    """
    self._log("Synthesizing text")
    wave_handler = wave_path = start_times = None
    succeeded = False
    try:
        self._log("Creating an output tempfile")
        wave_handler, wave_path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        self._log("Creating Synthesizer object")
        engine = Synthesizer(logger=self.logger)
        self._log("Synthesizing...")
        # the anchors are the first element of the synthesizer result
        start_times = engine.synthesize(self.task.text_file, wave_path)[0]
        self._log("Synthesizing... done")
        self._log("Synthesizing text: succeeded")
        succeeded = True
    except Exception as exc:
        self._log("Synthesizing text: failed")
        self._log(["Message: %s", str(exc)])
    # handler and path are returned even on failure,
    # so that the caller is able to clean them up
    return (succeeded, wave_handler, wave_path, start_times)
def perform(self, path, logger=None, quit_after=None, backwards=False):
    """
    Synthesize the plain text file at ``path`` into a temporary
    ``wav`` file and return the synthesizer result.

    The temporary file is always removed before returning,
    even if building the text file or synthesizing raises.

    :param path: path of the plain text file; it is passed through
                 ``get_abs_path`` before being read
    :param logger: the logger handed to the synthesizer
    :param quit_after: forwarded to ``Synthesizer.synthesize``
    :param backwards: forwarded to ``Synthesizer.synthesize``
    :return: whatever ``Synthesizer.synthesize`` returns
    """
    handler, output_file_path = tempfile.mkstemp(suffix=".wav")
    try:
        tfl = TextFile(get_abs_path(path), TextFileFormat.PLAIN)
        tfl.set_language(Language.EN)
        synth = Synthesizer(logger=logger)
        return synth.synthesize(
            tfl,
            output_file_path,
            quit_after=quit_after,
            backwards=backwards
        )
    finally:
        # do not leave the temporary wave file behind on failure
        delete_file(handler, output_file_path)
def inner(c_ext, cew_subprocess):
    """
    Synthesize with the given runtime configuration switches
    and check the result against the expected values.

    ``path``, ``logger``, ``quit_after``, ``backwards``, ``expected``,
    ``expected2`` and ``self`` are not parameters: they are taken from
    the enclosing scope, which is not visible in this chunk.

    The temporary output file is removed even when the synthesis raises,
    and before the assertions are evaluated.

    :param c_ext: value stored in ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value stored in
                           ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    """
    handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        tfl = TextFile(gf.absolute_path(path, __file__), TextFileFormat.PLAIN)
        tfl.set_language(Language.ENG)
        synth = Synthesizer(logger=logger)
        synth.rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        synth.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        result = synth.synthesize(
            tfl,
            output_file_path,
            quit_after=quit_after,
            backwards=backwards
        )
    finally:
        # do not leave the temporary wave file behind on failure
        gf.delete_file(handler, output_file_path)
    # result[0] is compared by length, result[1] (when requested)
    # by value, rounded to zero decimal places
    self.assertEqual(len(result[0]), expected)
    if expected2 is not None:
        self.assertAlmostEqual(result[1], expected2, places=0)
def _synthesize(self, text_file):
    """
    Synthesize text into a WAVE file.

    Return a tuple with four elements:

    1. handler of the generated wave file
    2. path of the generated wave file
    3. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    4. ``synthesizer.output_is_mono_wave``, that is,
       if the synthesizer produced a PCM16 mono WAVE file

    If the synthesis fails, the temporary file is deleted
    and the original exception is re-raised, since in that case
    the caller never receives the handler and path needed to clean up.

    :param text_file: the text to be synthesized;
                      it is passed as is to ``Synthesizer.synthesize``
    :rtype: tuple (handler, string, list, ``output_is_mono_wave`` flag)
    """
    synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
    handler, path = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    try:
        result = synthesizer.synthesize(text_file, path)
        anchors = result[0]
        return (handler, path, anchors, synthesizer.output_is_mono_wave)
    except Exception:
        # the caller cannot clean up a file it never heard about
        gf.delete_file(handler, path)
        raise
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect start

    A query wave is synthesized from ``self.text_file``
    (for ``max_start_length * self.QUERY_FACTOR`` seconds),
    its MFCCs are extracted, and a DTW accumulated cost matrix
    is computed against a window of the real audio MFCCs
    for each speech interval of ``self.audio_speech`` whose begin time
    lies in ``[min_start_length, max_start_length]``.
    The best candidate is chosen by ``self._select_best_candidate``.

    The temporary wave file holding the query is removed
    even if synthesizing or extracting the MFCCs raises.

    :param min_start_length: minimum begin time of an admissible interval
    :param max_start_length: maximum begin time of an admissible interval
    :param metric: ``SDMetric.VALUE`` or ``SDMetric.DISTORTION``;
                   it selects which quantity is tracked
                   for the "no improvement" early exit,
                   and it is passed to ``self._select_best_candidate``
    :param backwards: passed to the synthesizer; if ``True``,
                      the query wave is also reversed
    :return: the time of the start index of the best candidate,
             or ``0.0`` if there is no best candidate
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])

    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])

    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    try:
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(self.text_file, tmp_file_path, quit_after=synt_duration, backwards=backwards)
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")
    finally:
        # always remove the temporary query file, also on failure
        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

    # the third element of the synthesizer result is used
    # as the number of characters in the query
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len

    # never shrink: the stretch factor is at least 1
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])

    # restrict the real audio MFCCs to the initial search window
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

    # only the second dimension (number of frames) is needed
    _, o = audio_mfcc.shape
    _, n = query_mfcc.shape

    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)

    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])

    admissible_intervals = [
        x for x in self.audio_speech
        if ((x[0] >= min_start_length) and (x[0] <= max_start_length))
    ]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])

    candidates = []
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf

    for interval in admissible_intervals:
        # early exits: the search stops as soon as one of them triggers
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break

        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break

        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log([
            "Evaluating interval starting at %d == %.3f ",
            start_index,
            start_time
        ])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break

        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break

        # the window is at most twice as long as the query
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)

        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])

        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        _, m = audio_mfcc_sub.shape

        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        last_row = acm[-1, :]
        self._log("Computing DTW... done")

        # find the minimum, but its index must be >= stretched_match_minimum_length
        # NOTE(review): if last_row has no more than
        # stretched_match_minimum_length entries (it looks possible when
        # stretch_factor >= 2, as the window is capped at 2 * n frames),
        # the slice is empty and numpy.argmin raises ValueError -- confirm
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        candidate_distortion = candidate_value / candidate_length_index

        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0

        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1

        # append to the list of candidates
        self._log([
            " Interval start: %d == %.6f",
            start_index,
            start_time
        ])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([
            " Candidate start: %d == %.6f",
            start_index,
            start_time
        ])
        self._log([
            " Candidate end: %d == %.6f",
            candidate_end_index,
            candidate_end_time
        ])
        self._log([
            " Candidate length: %d == %.6f",
            candidate_length_index,
            candidate_length_time
        ])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })

    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def perform_command(self):
    """
    Perform command and return the appropriate exit code.

    At least four positional arguments are required, in this order:
    text format, text (or path of the text file), language,
    and path of the output file. The help is printed
    if they are missing or if the text format is not recognized.

    :rtype: int
    """
    positional = self.actual_arguments
    if len(positional) < 4:
        return self.print_help()

    text_format = gf.safe_unicode(positional[0])
    text_is_list = (text_format == u"list")
    if (not text_is_list) and (text_format not in TextFileFormat.ALLOWED_VALUES):
        return self.print_help()
    if text_is_list:
        # the text itself is given on the command line
        text = gf.safe_unicode(positional[1])
    else:
        # the text must be read from an existing file
        text = positional[1]
        if not self.check_input_file(text):
            return self.ERROR_EXIT_CODE

    # optional switches
    regex_l1 = self.has_option_with_value(u"--l1-id-regex")
    regex_l2 = self.has_option_with_value(u"--l2-id-regex")
    regex_l3 = self.has_option_with_value(u"--l3-id-regex")
    regex_id = self.has_option_with_value(u"--id-regex")
    regex_class = self.has_option_with_value(u"--class-regex")
    sort_algorithm = self.has_option_with_value(u"--sort")
    backwards = self.has_option([u"-b", u"--backwards"])
    quit_after = gf.safe_float(self.has_option_with_value(u"--quit-after"), None)
    first_index = gf.safe_int(self.has_option_with_value(u"--start"), None)
    last_index = gf.safe_int(self.has_option_with_value(u"--end"), None)

    parameters = {
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: regex_l1,
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: regex_l2,
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: regex_l3,
        gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: regex_class,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: regex_id,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort_algorithm,
    }

    # munparsed needs all the three level regexes
    if text_format == TextFileFormat.MUNPARSED:
        if any(regex is None for regex in (regex_l1, regex_l2, regex_l3)):
            self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
            return self.ERROR_EXIT_CODE

    # unparsed needs at least one of the two regexes
    if text_format == TextFileFormat.UNPARSED:
        if (regex_id is None) and (regex_class is None):
            self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
            return self.ERROR_EXIT_CODE

    language = gf.safe_unicode(positional[2])
    output_file_path = positional[3]
    if not self.check_output_file(output_file_path):
        return self.ERROR_EXIT_CODE

    text_file = self.get_text_file(text_format, text, parameters)
    if text_file is None:
        self.print_error(u"Unable to build a TextFile from the given parameters")
        return self.ERROR_EXIT_CODE
    if len(text_file) == 0:
        self.print_error(u"No text fragments found")
        return self.ERROR_EXIT_CODE

    text_file.set_language(language)
    self.print_info(u"Read input text with %d fragments" % (len(text_file)))
    if first_index is not None:
        self.print_info(u"Slicing from index %d" % (first_index))
    if last_index is not None:
        self.print_info(u"Slicing to index %d" % (last_index))
    text_slice = text_file.get_slice(first_index, last_index)
    self.print_info(u"Synthesizing %d fragments" % (len(text_slice)))
    if quit_after is not None:
        self.print_info(u"Stop synthesizing upon reaching %.3f seconds" % (quit_after))

    try:
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        synt.synthesize(text_slice, output_file_path, quit_after=quit_after, backwards=backwards)
        self.print_success(u"Created file '%s'" % output_file_path)
        synt.clear_cache()
        return self.NO_ERROR_EXIT_CODE
    except ImportError as exc:
        # a missing optional module gets installation instructions,
        # depending on the TTS engine selected in the configuration
        tts = self.rconf[RuntimeConfiguration.TTS]
        if tts == Synthesizer.AWS:
            instructions = (
                u"You need to install Python module boto3 to use the AWS Polly TTS API wrapper. Run:",
                u"$ pip install boto3",
                u"or, to install for all users:",
                u"$ sudo pip install boto3",
            )
        elif tts == Synthesizer.NUANCE:
            instructions = (
                u"You need to install Python module requests to use the Nuance TTS API wrapper. Run:",
                u"$ pip install requests",
                u"or, to install for all users:",
                u"$ sudo pip install requests",
            )
        else:
            instructions = None
        if instructions is None:
            self.print_error(u"An unexpected error occurred while synthesizing text:")
            self.print_error(u"%s" % exc)
        else:
            for line in instructions:
                self.print_error(line)
    except Exception as exc:
        self.print_error(u"An unexpected error occurred while synthesizing text:")
        self.print_error(u"%s" % exc)

    # reached only when the synthesis failed
    return self.ERROR_EXIT_CODE
def test_synthesize_path_not_writeable(self):
    """
    Synthesizing a default-constructed ``TextFile`` to
    ``self.PATH_NOT_WRITEABLE`` must raise ``OSError``.
    """
    empty_text = TextFile()
    engine = Synthesizer()
    self.assertRaises(
        OSError,
        engine.synthesize,
        empty_text,
        self.PATH_NOT_WRITEABLE
    )
def test_synthesize_invalid_text_file(self):
    """
    Passing a plain string instead of a text file object
    must raise ``TypeError``.
    """
    engine = Synthesizer()
    self.assertRaises(
        TypeError,
        engine.synthesize,
        "foo",
        self.PATH_NOT_WRITEABLE
    )
def test_synthesize_none(self):
    """
    Passing ``None`` instead of a text file object
    must raise ``TypeError``.
    """
    engine = Synthesizer()
    self.assertRaises(
        TypeError,
        engine.synthesize,
        None,
        self.PATH_NOT_WRITEABLE
    )
def test_clear_cache(self):
    """
    Clearing the cache of a freshly created synthesizer must not raise.
    """
    Synthesizer().clear_cache()
def _set_synthesizer(self):
    """
    Create the synthesizer, built with the runtime configuration
    and the logger of this object, and store it in ``self.synthesizer``.
    """
    self.log(u"Setting synthesizer...")
    new_synthesizer = Synthesizer(logger=self.logger, rconf=self.rconf)
    self.synthesizer = new_synthesizer
    self.log(u"Setting synthesizer... done")
def _detect(self, min_length, max_length, tail=False):
    """
    Detect the head or tail within ``min_length`` and ``max_length`` duration.

    If detecting the tail, the real wave MFCC and the query are reversed
    so that the tail detection problem reduces to a head detection problem.

    Return the duration of the head or tail, in seconds.

    The temporary wave file holding the synthesized query is removed
    even if synthesizing or extracting its MFCCs raises.

    :param min_length: estimated minimum length
    :type  min_length: :class:`~aeneas.timevalue.TimeValue`
    :param max_length: estimated maximum length
    :type  max_length: :class:`~aeneas.timevalue.TimeValue`
    :param tail: if ``True``, detect the tail instead of the head
    :type  tail: bool
    :rtype: :class:`~aeneas.timevalue.TimeValue`
    :raises: TypeError: if one of the parameters is not ``None`` or a number
    :raises: ValueError: if one of the parameters is negative
    """
    def _sanitize(value, default, name):
        # replace None with the default, then convert to TimeValue;
        # log_exc is called with True and an exception class:
        # presumably it logs and raises that exception -- TODO confirm
        if value is None:
            value = default
        try:
            value = TimeValue(value)
        except (TypeError, ValueError, InvalidOperation) as exc:
            self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
        if value < 0:
            self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
        return value

    min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
    max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
    mws = self.rconf.mws
    min_length_frames = int(min_length / mws)
    max_length_frames = int(max_length / mws)
    self.log([u"MFCC window shift s: %.3f", mws])
    self.log([u"Min start length s: %.3f", min_length])
    self.log([u"Min start length frames: %d", min_length_frames])
    self.log([u"Max start length s: %.3f", max_length])
    self.log([u"Max start length frames: %d", max_length_frames])
    self.log([u"Tail?: %s", str(tail)])

    self.log(u"Synthesizing query...")
    synt_duration = max_length * self.QUERY_FACTOR
    self.log([u"Synthesizing at least %.3f seconds", synt_duration])
    tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
    try:
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        anchors, total_time, synthesized_chars = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")
    finally:
        # always remove the temporary query file, also on failure
        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

    search_window = max_length * self.AUDIO_FACTOR
    search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
    self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
    self.log([u"Real MFCC length (frames): %d", self.real_wave_mfcc.all_length])
    self.log([u"Search window end (s): %.3f", search_window])
    self.log([u"Search window end (frames): %d", search_window_end])

    if tail:
        self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
        self.real_wave_mfcc.reverse()
        query_mfcc.reverse()

    # NOTE: VAD will be run here, if not done before
    speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
    if len(speech_intervals) < 1:
        self.log(u"No speech intervals, hence no start found")
        if tail:
            # restore the original orientation of the real wave
            self.real_wave_mfcc.reverse()
        return TimeValue("0.000")

    # generate a list of begin indices
    # search_end tracks the end of the last speech interval examined,
    # and it is used below as the right end of every candidate window
    search_end = None
    candidates_begin = []
    for interval in speech_intervals:
        if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
            candidates_begin.append(interval[0])
        search_end = interval[1]
        if search_end >= search_window_end:
            break

    # for each begin index, compute the acm cost
    # to match the query
    # note that we take the min over the last column of the acm
    # meaning that we allow to match the entire query wave
    # against a portion of the real wave
    candidates = []
    for candidate_begin in candidates_begin:
        self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
        try:
            rwm = AudioFileMFCC(
                mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                rconf=self.rconf,
                logger=self.logger
            )
            dtw = DTWAligner(
                real_wave_mfcc=rwm,
                synt_wave_mfcc=query_mfcc,
                rconf=self.rconf,
                logger=self.logger
            )
            acm = dtw.compute_accumulated_cost_matrix()
            last_column = acm[:, -1]
            min_value = numpy.min(last_column)
            min_index = numpy.argmin(last_column)
            self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
            self.log([u" Min value: %.6f", min_value])
            self.log([u" Min index: %d == %.3f", min_index, min_index * mws])
            candidates.append((min_value, candidate_begin, min_index))
        except Exception as exc:
            # a failing candidate is logged and skipped (best effort)
            self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

    # reverse again the real wave
    if tail:
        self.log(u"Tail => reversing real_wave_mfcc again")
        self.real_wave_mfcc.reverse()

    # return
    if len(candidates) < 1:
        self.log(u"No candidates found")
        return TimeValue("0.000")
    self.log(u"Candidates:")
    for candidate in candidates:
        self.log([u" Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
    # tuples sort by min value first, so the first one has the lowest cost
    best = sorted(candidates)[0][1]
    self.log([u"Best candidate: %d == %.3f", best, best * mws])
    return best * mws
def main():
    """
    Entry point

    The command line is read from ``sys.argv``:
    ``sys.argv[1]`` is the language, ``sys.argv[2]`` the text file path
    (or the ``|``-separated list of fragments if the format is ``list``),
    ``sys.argv[3]`` the text format, and the last argument
    the path of the audio file to be created.
    The arguments in between are options:
    either a bare flag (``-b`` and its aliases, to synthesize backwards)
    or a ``key=value`` pair (``id_regex``, ``class_regex``, ``sort``,
    ``start``, ``end``, ``quit_after``).

    The usage is printed if fewer than four arguments are given.
    """
    if len(sys.argv) < 5:
        usage()
        return
    language = sys.argv[1]
    text_file_path = sys.argv[2]
    text_format = sys.argv[3]
    audio_file_path = sys.argv[-1]
    backwards = False
    quit_after = None
    parameters = {}

    for option in sys.argv[4:-1]:
        # split on the first "=" only, so that values containing "="
        # (e.g., a regex) are not silently discarded
        key, separator, value = option.partition("=")
        if not separator:
            # a bare flag: once requested, "backwards" must stay set,
            # regardless of any other bare argument following it
            if key in ["b", "-b", "backwards", "--backwards"]:
                backwards = True
            continue
        if key == "id_regex":
            parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX] = value
        if key == "class_regex":
            parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX] = value
        if key == "sort":
            parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT] = value
        if (key == "start") or (key == "end"):
            try:
                parameters[key] = int(value)
            except ValueError:
                # best effort: ignore the option, but tell the user
                print("[WARN] Ignoring non-integer value '%s' for '%s'" % (value, key))
        if key == "quit_after":
            quit_after = float(value)

    if text_format == "list":
        text_file = TextFile()
        text_file.read_from_list(text_file_path.split("|"))
    else:
        text_file = TextFile(text_file_path, text_format, parameters)
    text_file.set_language(language)

    start_fragment = parameters.get("start")
    end_fragment = parameters.get("end")

    # single-argument parenthesized print works
    # both as a Python 2 statement and as a Python 3 function call
    print("[INFO] Read input text file with %d fragments" % (len(text_file)))
    if start_fragment is not None:
        print("[INFO] Slicing from index %d" % (start_fragment))
    if end_fragment is not None:
        print("[INFO] Slicing to index %d" % (end_fragment))
    text_slice = text_file.get_slice(start_fragment, end_fragment)
    print("[INFO] Synthesizing %d fragments" % (len(text_slice)))
    if quit_after is not None:
        print("[INFO] Stop synthesizing after reaching %.3f seconds" % (quit_after))
    if backwards:
        print("[INFO] Synthesizing backwards")
    synt = Synthesizer()
    synt.synthesize(text_slice, audio_file_path, quit_after, backwards)
    print("[INFO] Created file '%s'" % audio_file_path)
def build_sync_map(
    text_paths, audio_paths, tmp_dir,
    sync_map_text_path_prefix, sync_map_audio_path_prefix,
    skip_penalty, radius
):
    """
    This is an algorithm for building a sync map.
    It synthesizes text and then aligns synthesized audio with the recorded audio
    using a variation of the DTW (Dynamic Time Warping) algorithm.

    The main features of this algorithm are:
    1) It can handle structural differences in the beginning and in the end of files.
    2) It finds an approximation to an optimal warping path in linear time and space using FastDTW approach.

    Note that while the algorithm does not require one-to-one correspondence
    between text and audio files (i.e. the splitting can be done differently),
    the quality of the result is sensitive to the choice of skip_penalty and radius parameters,
    so it is recommended to have such a correspondence.

    Alignment details:
    Synthesized and recorded audio are represented as sequences of MFCC frames.
    These sequences are aligned using variation of the DTW algorithm.
    In contrast to the classic DTW, this algorithm can be used
    to align sequences with structural differences in the beginning or in the end.

    Steps to build a sync map:
    1) Synthesize text file and produce a list of anchors.
    Each anchor represents the start of the corresponding text fragment in a synthesized audio.
    2) Get sequences of MFCC frames of synthesized and recorded audio.
    3) Get their warping path by calling the alignment algorithm.
    4) Check whether the extra content is found, calculate mapping boundaries.
    5) Map anchors inside the boundaries to the recorded MFCC sequence using warping path from step 3.
    6) Start all over again considering:
    If there is an extra content in the end of synthesized sequence, align it with the next audio file.
    If there is an extra content in the end of recorded sequence, align it with the next text file.
    If both sequences have extra content in the end, align text tail with the next audio file.
    If none of the above, align next text and audio files.

    :param text_paths: iterator over the text file paths (consumed with ``next``)
    :param audio_paths: iterator over the audio file paths (consumed with ``next``)
    :param tmp_dir: directory where the intermediate ``wav`` files are written
    :param sync_map_text_path_prefix: prefix joined to each text file name
                                      to build the keys of the returned sync map
    :param sync_map_audio_path_prefix: prefix joined to each audio file name
                                       to build the ``audio_file`` values
    :param skip_penalty: passed to ``c_FastDTWBD``
    :param radius: passed to ``c_FastDTWBD``
    :return: a dict mapping each output text name to a dict
             ``{fragment: {'audio_file', 'begin_time', 'end_time'}}``,
             or an empty dict if an alignment returns an empty path
    """
    synthesizer = Synthesizer()
    parse_parameters = {'is_text_unparsed_id_regex': 'f[0-9]+'}

    sync_map = {}
    process_next_text = True
    process_next_audio = True

    # the loop ends when either iterator is exhausted
    while True:
        if process_next_text:
            try:
                text_path = next(text_paths)
            except StopIteration:
                break

            text_name = get_name_from_path(text_path)
            output_text_name = os.path.join(sync_map_text_path_prefix, text_name)
            textfile = TextFile(text_path, file_format=TextFileFormat.UNPARSED, parameters=parse_parameters)
            textfile.set_language(Language.ENG)
            text_wav_path = os.path.join(tmp_dir, f'{drop_extension(text_name)}_text.wav')
            sync_map[output_text_name] = {}

            # Produce synthesized audio, get anchors
            anchors,_,_ = synthesizer.synthesize(textfile, text_wav_path)

            # Get fragments, convert anchors timings to the frames indices
            # (each anchor is indexed as a[0] = time, a[1] = fragment;
            # 0.040 is presumably the MFCC frame shift in seconds -- TODO confirm)
            fragments = [a[1] for a in anchors]
            anchors = np.array([int(a[0] / TimeValue('0.040')) for a in anchors])

            # MFCC frames sequence memory layout is a n x l 2D array,
            # where n - number of frames and l - number of MFCCs
            # i.e it is c-contiguous, but after dropping the first coefficient it ceases to be c-contiguous.
            # Should decide whether to make a copy or to work around the first coefficient.
            text_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(text_wav_path).all_mfcc.T[:, 1:]
            )

        if process_next_audio:
            try:
                audio_path = next(audio_paths)
            except StopIteration:
                break

            audio_name = get_name_from_path(audio_path)
            output_audio_name = os.path.join(sync_map_audio_path_prefix, audio_name)
            audio_wav_path = os.path.join(tmp_dir, f'{drop_extension(audio_name)}_audio.wav')
            # NOTE(review): the ffmpeg exit status is not checked here;
            # "-n" presumably keeps an already existing output file -- confirm
            subprocess.run(['ffmpeg', '-n', '-i', audio_path, audio_wav_path])

            audio_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(audio_wav_path).all_mfcc.T[:, 1:]
            )

            # Keep track to calculate frames timings
            audio_start_frame = 0

        # n is not used further in this function
        n = len(text_mfcc_sequence)
        m = len(audio_mfcc_sequence)

        _, path = c_FastDTWBD(text_mfcc_sequence, audio_mfcc_sequence, skip_penalty, radius=radius)

        if len(path) == 0:
            print(
                f'No match between {text_name} and {audio_name}. '
                f'Alignment is terminated. '
                f'Adjust skip_penalty or input files.'
            )
            return {}

        # Project path to the text and audio sequences
        # (column 0 holds text frames, column 1 holds audio frames)
        text_path_frames = path[:,0]
        audio_path_frames = path[:,1]

        last_matched_audio_frame = audio_path_frames[-1]

        # Find first and last matched frames
        first_matched_text_frame = text_path_frames[0]
        last_matched_text_frame = text_path_frames[-1]

        # Map only those fragments that intersect matched frames
        anchors_boundary_indices = np.searchsorted(
            anchors, [first_matched_text_frame, last_matched_text_frame]
        )
        # step back by one anchor (but not below 0), so that the fragment
        # containing the first matched frame is included
        map_anchors_from = max(anchors_boundary_indices[0] - 1, 0)
        map_anchors_to = anchors_boundary_indices[1]
        anchors_to_map = anchors[map_anchors_from:map_anchors_to]
        fragments_to_map = fragments[map_anchors_from:map_anchors_to]

        # Get anchors indices in the path projection to the text sequence
        text_path_anchor_indices = np.searchsorted(text_path_frames, anchors_to_map)

        # Get anchors' frames in audio sequence, calculate their timings
        # (the last matched audio frame is appended as the end of the last fragment)
        anchors_matched_frames = audio_path_frames[text_path_anchor_indices]
        timings = (np.append(anchors_matched_frames, audio_path_frames[-1]) + audio_start_frame) * 0.040

        # Map fragment_ids to timings, update mapping of the current text file
        fragment_map = {
            f: {
                'audio_file': output_audio_name,
                'begin_time': time_to_str(bt),
                'end_time': time_to_str(et)
            }
            for f, bt, et in zip(fragments_to_map, timings[:-1], timings[1:])
        }

        sync_map[output_text_name].update(fragment_map)

        # Decide whether to process next file or to align the tail of the current one

        if map_anchors_to == len(anchors):
            # Process next text if no fragments are left
            process_next_text = True
        else:
            # Otherwise align tail of the current text
            process_next_text = False
            text_mfcc_sequence = text_mfcc_sequence[last_matched_text_frame:]
            fragments = fragments[map_anchors_to:]
            # re-base the remaining anchors on the truncated text sequence
            anchors = anchors[map_anchors_to:] - last_matched_text_frame

        if last_matched_audio_frame == m - 1 or not process_next_text:
            # Process next audio if there are no unmatched audio frames in the tail
            # or there are more text fragments to map, i.e.
            # we choose to process next audio if we cannot decide.
            # This strategy is correct if there are no extra fragments in the end.
            process_next_audio = True
        else:
            # Otherwise align tail of the current audio
            process_next_audio = False
            audio_mfcc_sequence = audio_mfcc_sequence[last_matched_audio_frame:]
            # keep the offset, so that timings stay relative to the whole audio file
            audio_start_frame += last_matched_audio_frame

    return sync_map