def main():
    """
    Entry point: read a text file and print it.

    Command line: ``FILE_OR_LIST FORMAT [key=value ...]``.

    The recognised ``key=value`` options are ``id_regex``, ``class_regex``
    and ``sort``; any other argument is ignored.  When ``FORMAT`` is
    ``list``, the first argument is not a path but the text fragments
    themselves, separated by ``|``.
    """
    if len(sys.argv) < 3:
        usage()
        return

    file_path = sys.argv[1]
    text_format = sys.argv[2]

    # Command-line option name -> TextFile parameter name.
    option_to_parameter = {
        "id_regex": gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX,
        "class_regex": gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX,
        "sort": gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT,
    }

    parameters = {}
    for argument in sys.argv[3:]:
        # Split on the FIRST "=" only: a value that itself contains "="
        # (quite possible for a regex) must be kept intact rather than
        # causing the whole option to be silently dropped.
        key, separator, value = argument.partition("=")
        if separator and key in option_to_parameter:
            parameters[option_to_parameter[key]] = value

    if text_format == "list":
        text_file = TextFile()
        text_file.read_from_list(file_path.split("|"))
    else:
        text_file = TextFile(file_path, text_format, parameters)

    # Single-argument call form: prints the same thing whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print(str(text_file))
def get_text_file(self, text_format, text, parameters):
    """
    Build a ``TextFile`` either from an inline list or from a file.

    :param text_format: ``u"list"`` or one of ``TextFileFormat.ALLOWED_VALUES``
    :param text: ``|``-separated fragments when the format is ``list``,
                 otherwise the path of the file to read
    :param parameters: passed through to ``TextFile`` when reading a file
    :return: the ``TextFile`` object, or ``None`` after an error has been
             reported through ``self.print_error``
    """
    # Inline list: the fragments are given directly on the command line.
    if text_format == u"list":
        inline_file = TextFile(logger=self.logger)
        inline_file.read_from_list(text.split(u"|"))
        return inline_file

    # Reject unknown formats before touching the file system.
    if text_format not in TextFileFormat.ALLOWED_VALUES:
        self.print_error(u"File format '%s' is not allowed" % (text_format))
        allowed_formats = " ".join(TextFileFormat.ALLOWED_VALUES)
        self.print_error(u"Allowed text file formats: %s" % (allowed_formats))
        return None

    try:
        loaded_file = TextFile(text, text_format, parameters, logger=self.logger)
    except OSError:
        self.print_error(u"Cannot read file '%s'" % (text))
        return None
    return loaded_file
def total_FA(soundfile, mylines, myhead, mytail, config=None):
    """
    Run a forced alignment by calling Aeneas as a library.

    According to the original author this function is currently not in use,
    because results obtained this way were not reliable.

    ``mylines`` is iterated as ``(identifier, text)`` pairs, ``myhead`` and
    ``mytail`` are inserted into the task configuration as the audio head
    and tail lengths, and ``config`` (optional) is appended to the
    configuration string.  Returns ``task.sync_map.fragments``.
    """
    # Pick the configuration template; the optional extra settings are
    # appended as one more "|"-separated item.
    if config is None:
        template = (
            u"task_language=nor|is_text_type=plain|os_task_file_format=json|is_audio_file_head_length=%s|is_audio_file_tail_length=%s")
        values = (myhead, mytail)
    else:
        template = (
            u"task_language=nor|is_text_type=plain|os_task_file_format=json|is_audio_file_head_length=%s|is_audio_file_tail_length=%s|%s")
        values = (myhead, mytail, config)
    config_string = template % values
    print(config_string)

    # Create the Task object and point it at the audio file.
    task = Task(config_string=config_string)
    print(task)
    task.audio_file_path_absolute = soundfile

    # Build the text in memory, one fragment per input line.
    textfile = TextFile()
    print(textfile)
    for line in mylines:
        identifier, frag_text = line
        fragment = TextFragment(identifier, Language.NOR, frag_text, frag_text)
        textfile.add_fragment(fragment)
    task.text_file = textfile
    print(len(task.text_file))

    ExecuteTask(task).execute()
    return task.sync_map.fragments
def check_espeak(cls):
    """
    Check whether ``espeak`` can be called.

    A one-line text is synthesized into a temporary ``.wav`` file; any
    ordinary error while importing the aeneas modules or synthesizing is
    treated as "espeak not working" and reported to the user.

    Return ``True`` on failure and ``False`` on success.

    :rtype: bool
    """
    try:
        from aeneas.textfile import TextFile
        from aeneas.textfile import TextFragment
        from aeneas.ttswrappers.espeakttswrapper import ESPEAKTTSWrapper
        text = u"From fairest creatures we desire increase,"
        text_file = TextFile()
        text_file.add_fragment(TextFragment(language=u"eng", lines=[text], filtered_lines=[text]))
        handler, output_file_path = gf.tmp_file(suffix=u".wav")
        try:
            ESPEAKTTSWrapper().synthesize_multiple(text_file, output_file_path)
        finally:
            # Remove the temporary file even when synthesis fails, so a
            # broken espeak installation does not leave files behind.
            gf.delete_file(handler, output_file_path)
        gf.print_success(u"espeak OK")
        return False
    except Exception:
        # Deliberately best-effort: every ordinary failure means "espeak is
        # not usable".  ``Exception`` (instead of a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        pass
    gf.print_error(u"espeak ERROR")
    gf.print_info(u" Please make sure you have espeak installed correctly")
    gf.print_info(u" and that its path is in your PATH environment variable")
    gf.print_info(u" You might also want to check that the espeak-data directory")
    gf.print_info(u" is set up correctly, for example, it has the correct permissions")
    return True
def test_read_from_list_with_ids(self):
    """Reading five ``[id, text]`` pairs yields a text file of length 5."""
    id_text_pairs = [
        ["a1", "fragment 1"],
        ["b2", "fragment 2"],
        ["c3", "fragment 3"],
        ["d4", "fragment 4"],
        ["e5", "fragment 5"],
    ]
    text_file = TextFile()
    text_file.read_from_list_with_ids(id_text_pairs)
    self.assertEqual(len(text_file), 5)
def tfl(self, frags):
    """
    Build a ``TextFile`` holding one fragment per ``(language, lines)`` pair.

    The same ``lines`` value is used for both the ``lines`` and the
    ``filtered_lines`` of each fragment.
    """
    text_file = TextFile()
    for language, lines in frags:
        fragment = TextFragment(
            language=language,
            lines=lines,
            filtered_lines=lines
        )
        text_file.add_fragment(fragment)
    return text_file
def test_read_from_list_with_ids(self):
    """Five ``(id, text)`` tuples give 5 fragments and 50 characters."""
    id_text_pairs = [
        (u"a1", u"fragment 1"),
        (u"b2", u"fragment 2"),
        (u"c3", u"fragment 3"),
        (u"d4", u"fragment 4"),
        (u"e5", u"fragment 5"),
    ]
    text_file = TextFile()
    text_file.read_from_list_with_ids(id_text_pairs)
    self.assertEqual(len(text_file), 5)
    self.assertEqual(text_file.chars, 50)
def perform(self, path, logger=None, quit_after=None, backwards=False):
    """
    Synthesize the plain text file at ``path`` into a temporary ``.wav`` file.

    :param path: path of the text file, resolved through ``get_abs_path``
    :param logger: passed to the ``Synthesizer`` constructor
    :param quit_after: passed through to ``Synthesizer.synthesize``
    :param backwards: passed through to ``Synthesizer.synthesize``
    :return: whatever ``Synthesizer.synthesize`` returns
    """
    handler, output_file_path = tempfile.mkstemp(suffix=".wav")
    try:
        tfl = TextFile(get_abs_path(path), TextFileFormat.PLAIN)
        tfl.set_language(Language.EN)
        synth = Synthesizer(logger=logger)
        return synth.synthesize(tfl, output_file_path, quit_after=quit_after, backwards=backwards)
    finally:
        # Always remove the temporary file, including when reading the text
        # or synthesizing raises; otherwise every failing call leaks a file.
        delete_file(handler, output_file_path)
def test_read_from_list(self):
    """Reading a list of five strings yields a text file of length 5."""
    fragment_texts = [
        "fragment 1",
        "fragment 2",
        "fragment 3",
        "fragment 4",
        "fragment 5",
    ]
    text_file = TextFile()
    text_file.read_from_list(fragment_texts)
    self.assertEqual(len(text_file), 5)
def load(self, input_file_path=PLAIN_FILE_PATH, fmt=TextFileFormat.PLAIN, expected_length=15, parameters=None):
    """
    Load a text file and check its number of fragments.

    :param input_file_path: path of the file, resolved through ``get_abs_path``
    :param fmt: text file format passed to ``TextFile``
    :param expected_length: number of fragments the loaded file must have
    :param parameters: optional parameters passed to ``TextFile``
    :return: the loaded ``TextFile``
    """
    absolute_path = get_abs_path(input_file_path)
    text_file = TextFile(absolute_path, fmt, parameters)
    self.assertEqual(len(text_file), expected_length)
    return text_file
def test_append_fragment_multiple(self):
    """Each appended fragment increases the length of the text file by one."""
    text_file = TextFile()
    self.assertEqual(len(text_file), 0)
    fragments = (
        ("a1", "fragment 1"),
        ("a2", "fragment 2"),
        ("a3", "fragment 3"),
    )
    for expected_length, (identifier, text) in enumerate(fragments, 1):
        text_file.append_fragment(TextFragment(identifier, Language.EN, text))
        self.assertEqual(len(text_file), expected_length)
def test_add_fragment_multiple(self):
    """Adding three fragments gives length 3 and a total of 30 characters."""
    text_file = TextFile()
    self.assertEqual(len(text_file), 0)
    fragments = (
        (u"a1", u"fragment 1"),
        (u"a2", u"fragment 2"),
        (u"a3", u"fragment 3"),
    )
    for expected_length, (identifier, text) in enumerate(fragments, 1):
        text_file.add_fragment(TextFragment(identifier, Language.ENG, [text]))
        self.assertEqual(len(text_file), expected_length)
    self.assertEqual(text_file.chars, 30)
def inner(c_ext, cew_subprocess):
    """
    Synthesize the text file with the given runtime switches and check the result.

    ``self``, ``path``, ``logger``, ``quit_after``, ``backwards``,
    ``expected`` and ``expected2`` are not defined here; they are taken from
    the scope in which this function is defined.

    :param c_ext: value for ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value for ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    """
    handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        tfl = TextFile(gf.absolute_path(path, __file__), TextFileFormat.PLAIN)
        tfl.set_language(Language.ENG)
        synth = Synthesizer(logger=logger)
        synth.rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        synth.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        result = synth.synthesize(tfl, output_file_path, quit_after=quit_after, backwards=backwards)
    finally:
        # Remove the temporary file even when reading the text or
        # synthesizing raises, so failing runs do not leak files.
        gf.delete_file(handler, output_file_path)
    self.assertEqual(len(result[0]), expected)
    if expected2 is not None:
        self.assertAlmostEqual(result[1], expected2, places=0)
def _populate_text_file(self):
    """
    Read the text file at ``self.text_file_path_absolute``
    and store the resulting object in ``self.text_file``.

    The file is read only when both the path and the configured language
    are set; otherwise only a log message is emitted.
    """
    self.log(u"Populate text file...")
    if (self.text_file_path_absolute is None) or (self.configuration["language"] is None):
        self.log(u"text_file_path_absolute and/or language is None")
    else:
        config = self.configuration
        # (TextFile parameter name, configuration key) pairs;
        # any of the configuration values might be None
        parameter_sources = (
            (gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX, "i_t_ignore_regex"),
            (gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP, "i_t_transliterate_map"),
            (gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR, "i_t_mplain_word_separator"),
            (gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX, "i_t_munparsed_l1_id_regex"),
            (gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX, "i_t_munparsed_l2_id_regex"),
            (gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX, "i_t_munparsed_l3_id_regex"),
            (gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX, "i_t_unparsed_class_regex"),
            (gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX, "i_t_unparsed_id_regex"),
            (gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT, "i_t_unparsed_id_sort"),
            (gc.PPN_TASK_OS_FILE_ID_REGEX, "o_id_regex"),
        )
        parameters = {name: config[key] for name, key in parameter_sources}
        self.text_file = TextFile(
            file_path=self.text_file_path_absolute,
            file_format=config["i_t_format"],
            parameters=parameters,
            logger=self.logger
        )
        self.text_file.set_language(config["language"])
    self.log(u"Populate text file... done")
def _populate_text_file(self):
    """
    Read the text file at ``self.text_file_path_absolute``
    and store the resulting object in ``self.text_file``.

    Nothing happens unless both the path and the configured language are set.
    """
    if (self.text_file_path_absolute is None) or (self.configuration.language is None):
        return

    config = self.configuration
    parameters = {
        gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: config.is_text_unparsed_class_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: config.is_text_unparsed_id_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: config.is_text_unparsed_id_sort,
    }
    self.text_file = TextFile(
        file_path=self.text_file_path_absolute,
        file_format=config.is_text_file_format,
        parameters=parameters,
        logger=None
    )
    self.text_file.set_language(config.language)
def test_synthesize_path_not_writeable(self):
    """Synthesizing to ``self.PATH_NOT_WRITEABLE`` raises ``OSError``."""
    empty_text = TextFile()
    synthesizer = Synthesizer()
    self.assertRaises(
        OSError,
        synthesizer.synthesize,
        empty_text,
        self.PATH_NOT_WRITEABLE
    )
def main():
    """
    Entry point: synthesize a text file into an audio file.

    Command line: ``LANGUAGE TEXT FORMAT [options ...] AUDIO_FILE``.

    Options placed between ``FORMAT`` and the output path:

    * ``b`` / ``-b`` / ``backwards`` / ``--backwards``: synthesize backwards
    * ``id_regex=...``, ``class_regex=...``, ``sort=...``: text parsing parameters
    * ``start=N``, ``end=N``: slice the fragments before synthesizing
    * ``quit_after=SECONDS``: passed to the synthesizer

    When ``FORMAT`` is ``list``, ``TEXT`` holds the fragments themselves,
    separated by ``|``, instead of a file path.
    """
    if len(sys.argv) < 5:
        usage()
        return

    language = sys.argv[1]
    text_file_path = sys.argv[2]
    text_format = sys.argv[3]
    audio_file_path = sys.argv[-1]

    # Command-line option name -> TextFile parameter name.
    option_to_parameter = {
        "id_regex": gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX,
        "class_regex": gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX,
        "sort": gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT,
    }
    backwards_flags = ("b", "-b", "backwards", "--backwards")

    backwards = False
    quit_after = None
    parameters = {}
    for argument in sys.argv[4:-1]:
        # Split on the first "=" only, so that a value containing "="
        # (e.g. a regex) does not make the whole option disappear.
        key, separator, value = argument.partition("=")
        if not separator:
            # Only ever switch the flag ON: a later unrelated bare argument
            # must not reset a previously given "backwards" flag.
            if argument in backwards_flags:
                backwards = True
            continue
        if key in option_to_parameter:
            parameters[option_to_parameter[key]] = value
        elif key in ("start", "end"):
            try:
                parameters[key] = int(value)
            except ValueError:
                # A malformed index is skipped (no slicing on that side),
                # but the user is told instead of it failing silently.
                print("[WARN] Ignoring non-integer value '%s' for '%s'" % (value, key))
        elif key == "quit_after":
            quit_after = float(value)

    if text_format == "list":
        text_file = TextFile()
        text_file.read_from_list(text_file_path.split("|"))
    else:
        text_file = TextFile(text_file_path, text_format, parameters)
    text_file.set_language(language)

    start_fragment = parameters.get("start")
    end_fragment = parameters.get("end")

    # All print calls below use the single-argument form, which behaves the
    # same as a Python 2 print statement and the Python 3 print function.
    print("[INFO] Read input text file with %d fragments" % (len(text_file)))
    if start_fragment is not None:
        print("[INFO] Slicing from index %d" % (start_fragment))
    if end_fragment is not None:
        print("[INFO] Slicing to index %d" % (end_fragment))
    text_slice = text_file.get_slice(start_fragment, end_fragment)
    print("[INFO] Synthesizing %d fragments" % (len(text_slice)))
    if quit_after is not None:
        print("[INFO] Stop synthesizing after reaching %.3f seconds" % (quit_after))
    if backwards:
        print("[INFO] Synthesizing backwards")

    synt = Synthesizer()
    synt.synthesize(text_slice, audio_file_path, quit_after, backwards)
    print("[INFO] Created file '%s'" % audio_file_path)
def test_set_language_on_empty(self):
    """Setting the language of an empty text file leaves it empty."""
    empty_file = TextFile()
    length_before = len(empty_file)
    self.assertEqual(length_before, 0)
    empty_file.set_language(Language.EN)
    length_after = len(empty_file)
    self.assertEqual(length_after, 0)
def test_add_fragment(self):
    """Adding one fragment gives length 1 and a total of 10 characters."""
    text_file = TextFile()
    self.assertEqual(len(text_file), 0)
    fragment = TextFragment(u"a1", Language.ENG, [u"fragment 1"])
    text_file.add_fragment(fragment)
    self.assertEqual(len(text_file), 1)
    self.assertEqual(text_file.chars, 10)
def load(self):
    """
    Build an ``SD`` object from ``self.AUDIO_FILE`` and ``self.TEXT_FILE``.

    The text file is read as plain text and its language is set to
    ``Language.ENG``.
    """
    mfcc = AudioFileMFCC(self.AUDIO_FILE)
    plain_text = TextFile(self.TEXT_FILE, file_format=TextFileFormat.PLAIN)
    plain_text.set_language(Language.ENG)
    detector = SD(mfcc, plain_text)
    return detector
def test_invalid_add_fragment(self):
    """Adding something that is not a fragment raises ``TypeError``."""
    text_file = TextFile()
    self.assertRaises(TypeError, text_file.add_fragment, "foo")
def test_empty_fragments(self):
    """A text file created without arguments contains no fragments."""
    fragment_count = len(TextFile())
    self.assertEqual(fragment_count, 0)
def main():
    """
    Entry point: detect the head, text and tail intervals of an audio file.

    Command line: ``LANGUAGE TEXT FORMAT [options ...] AUDIO_FILE``.

    Options placed between ``FORMAT`` and the audio path:

    * ``v`` / ``-v`` / ``verbose`` / ``--verbose``: make the logger verbose
    * ``id_regex=...``, ``class_regex=...``, ``sort=...``: text parsing parameters
    * ``min_head_length=S``, ``max_head_length=S``,
      ``min_tail_length=S``, ``max_tail_length=S``: detection bounds,
      defaulting to the corresponding ``gc.SD_*`` constants

    When ``FORMAT`` is ``list``, ``TEXT`` holds the fragments themselves,
    separated by ``|``, instead of a file path.
    """
    if len(sys.argv) < 5:
        usage()
        return

    language = sys.argv[1]
    text_file_path = sys.argv[2]
    text_format = sys.argv[3]
    audio_file_path = sys.argv[-1]

    # Command-line option name -> TextFile parameter name.
    option_to_parameter = {
        "id_regex": gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX,
        "class_regex": gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX,
        "sort": gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT,
    }
    float_options = (
        "min_head_length",
        "max_head_length",
        "min_tail_length",
        "max_tail_length",
    )
    verbose_flags = ("v", "-v", "verbose", "--verbose")

    verbose = False
    parameters = {}
    for argument in sys.argv[4:-1]:
        # Split on the first "=" only, so that a value containing "="
        # (e.g. a regex) does not make the whole option disappear.
        key, separator, value = argument.partition("=")
        if not separator:
            # Only ever switch the flag ON: a later unrelated bare argument
            # must not reset a previously given "verbose" flag.
            if argument in verbose_flags:
                verbose = True
            continue
        if key in option_to_parameter:
            parameters[option_to_parameter[key]] = value
        elif key in float_options:
            # Each bound is stored under its OWN name.  (Previously
            # "min_tail_length" was written to "min_head_length", so the
            # tail bound was ignored and the head bound overwritten.)
            parameters[key] = float(value)

    # All print calls use the single-argument form, which behaves the same
    # as a Python 2 print statement and the Python 3 print function.
    if not gf.can_run_c_extension():
        print("[WARN] Unable to load Python C Extensions")
        print("[WARN] Running the slower pure Python code")
        print("[WARN] See the README file for directions to compile the Python C Extensions")

    logger = Logger(tee=verbose)

    print("[INFO] Reading audio...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    try:
        converter = FFMPEGWrapper(logger=logger)
        converter.convert(audio_file_path, tmp_file_path)
        audio_file = AudioFile(tmp_file_path)
        print("[INFO] Reading audio... done")

        print("[INFO] Reading text...")
        if text_format == "list":
            text_file = TextFile()
            text_file.read_from_list(text_file_path.split("|"))
        else:
            text_file = TextFile(text_file_path, text_format, parameters)
        text_file.set_language(language)
        print("[INFO] Reading text... done")

        print("[INFO] Detecting audio interval...")
        sd = SD(audio_file, text_file, logger=logger)
        min_head_length = parameters.get("min_head_length", gc.SD_MIN_HEAD_LENGTH)
        max_head_length = parameters.get("max_head_length", gc.SD_MAX_HEAD_LENGTH)
        min_tail_length = parameters.get("min_tail_length", gc.SD_MIN_TAIL_LENGTH)
        max_tail_length = parameters.get("max_tail_length", gc.SD_MAX_TAIL_LENGTH)
        start, end = sd.detect_interval(min_head_length, max_head_length, min_tail_length, max_tail_length)

        zero = 0
        audio_len = audio_file.audio_length
        head_len = start
        text_len = end - start
        tail_len = audio_len - end
        print("[INFO] Detecting audio interval... done")

        # Report once in seconds ...
        print("[INFO] ")
        print("[INFO] Head: %.3f %.3f (%.3f)" % (zero, start, head_len))
        print("[INFO] Text: %.3f %.3f (%.3f)" % (start, end, text_len))
        print("[INFO] Tail: %.3f %.3f (%.3f)" % (end, audio_len, tail_len))
        print("[INFO] ")

        # ... and once formatted by gf.time_to_hhmmssmmm.
        zero_h = gf.time_to_hhmmssmmm(0)
        start_h = gf.time_to_hhmmssmmm(start)
        end_h = gf.time_to_hhmmssmmm(end)
        audio_len_h = gf.time_to_hhmmssmmm(audio_len)
        head_len_h = gf.time_to_hhmmssmmm(head_len)
        text_len_h = gf.time_to_hhmmssmmm(text_len)
        tail_len_h = gf.time_to_hhmmssmmm(tail_len)
        print("[INFO] Head: %s %s (%s)" % (zero_h, start_h, head_len_h))
        print("[INFO] Text: %s %s (%s)" % (start_h, end_h, text_len_h))
        print("[INFO] Tail: %s %s (%s)" % (end_h, audio_len_h, tail_len_h))
    finally:
        # Clean up the temporary audio file even when conversion, reading
        # or detection fails.
        cleanup(tmp_handler, tmp_file_path)
def test_append_fragment(self):
    """Appending one fragment to an empty text file gives length 1."""
    text_file = TextFile()
    self.assertEqual(len(text_file), 0)
    fragment = TextFragment("a1", Language.EN, "fragment 1")
    text_file.append_fragment(fragment)
    self.assertEqual(len(text_file), 1)
def test_file_path_not_existing(self):
    """Constructing a text file from a non-existing path raises ``OSError``."""
    self.assertRaises(OSError, TextFile, file_path=self.NOT_EXISTING_PATH)
def test_invalid_format(self):
    """An unknown ``file_format`` value raises ``ValueError``."""
    self.assertRaises(ValueError, TextFile, file_format="foo")
def test_constructor(self):
    """The default constructor creates a text file with no fragments."""
    new_file = TextFile()
    fragment_count = len(new_file)
    self.assertEqual(fragment_count, 0)
def test_invalid_parameters(self):
    """Passing a list as ``parameters`` raises ``TypeError``."""
    self.assertRaises(TypeError, TextFile, parameters=["foo"])
def test_no_fragments(self):
    """Synthesizing a text file without fragments is expected to raise ``ValueError``."""
    empty_text = TextFile()
    empty_text.set_language(self.TTS_LANGUAGE)
    self.synthesize(empty_text, expected_exc=ValueError)
def build_sync_map(
    text_paths, audio_paths, tmp_dir,
    sync_map_text_path_prefix, sync_map_audio_path_prefix,
    skip_penalty, radius
):
    """
    Build a sync map between text fragments and recorded audio.

    The text is synthesized and the synthesized audio is then aligned with
    the recorded audio using a variation of the DTW (Dynamic Time Warping)
    algorithm.

    The main features of this algorithm are:
    1) It can handle structural differences at the beginning and at the end of files.
    2) It finds an approximation to an optimal warping path in linear time and space
       using the FastDTW approach.

    Note that while the algorithm does not require a one-to-one correspondence
    between text and audio files (i.e. the splitting can be done differently),
    the quality of the result is sensitive to the choice of the skip_penalty and
    radius parameters, so it is recommended to have such a correspondence.

    Alignment details:
    Synthesized and recorded audio are represented as sequences of MFCC frames.
    These sequences are aligned using a variation of the DTW algorithm.
    In contrast to the classic DTW, this algorithm can be used to align
    sequences with structural differences at the beginning or at the end.

    Steps to build a sync map:
    1) Synthesize the text file and produce a list of anchors.
       Each anchor represents the start of the corresponding text fragment
       in the synthesized audio.
    2) Get the sequences of MFCC frames of the synthesized and recorded audio.
    3) Get their warping path by calling the alignment algorithm.
    4) Check whether extra content is found, calculate mapping boundaries.
    5) Map the anchors inside the boundaries to the recorded MFCC sequence
       using the warping path from step 3.
    6) Start all over again considering:
       If there is extra content at the end of the synthesized sequence,
       align it with the next audio file.
       If there is extra content at the end of the recorded sequence,
       align it with the next text file.
       If both sequences have extra content at the end,
       align the text tail with the next audio file.
       If none of the above, align the next text and audio files.

    Parameters:
    text_paths, audio_paths -- iterators over input file paths
        (``next()`` is called on them; the loop ends when either is exhausted).
    tmp_dir -- directory in which the intermediate ``*_text.wav`` and
        ``*_audio.wav`` files are created.
    sync_map_text_path_prefix -- joined with each text file name to form
        the keys of the returned dictionary.
    sync_map_audio_path_prefix -- joined with each audio file name to form
        the ``'audio_file'`` values of the returned dictionary.
    skip_penalty, radius -- passed through to ``c_FastDTWBD``.

    Returns a dictionary of the form
    ``{text_name: {fragment: {'audio_file', 'begin_time', 'end_time'}}}``,
    or an empty dictionary if the alignment returns an empty warping path.
    """

    synthesizer = Synthesizer()
    parse_parameters = {'is_text_unparsed_id_regex': 'f[0-9]+'}

    sync_map = {}
    process_next_text = True
    process_next_audio = True

    while True:
        if process_next_text:
            try:
                text_path = next(text_paths)
            except StopIteration:
                break

            text_name = get_name_from_path(text_path)
            output_text_name = os.path.join(sync_map_text_path_prefix, text_name)
            textfile = TextFile(text_path, file_format=TextFileFormat.UNPARSED, parameters=parse_parameters)
            textfile.set_language(Language.ENG)
            text_wav_path = os.path.join(tmp_dir, f'{drop_extension(text_name)}_text.wav')
            sync_map[output_text_name] = {}

            # Produce synthesized audio, get anchors
            anchors,_,_ = synthesizer.synthesize(textfile, text_wav_path)

            # Get fragments, convert anchor timings to frame indices
            # (0.040 is presumably the MFCC frame shift in seconds -- TODO confirm)
            fragments = [a[1] for a in anchors]
            anchors = np.array([int(a[0] / TimeValue('0.040')) for a in anchors])

            # MFCC frames sequence memory layout is an n x l 2D array,
            # where n - number of frames and l - number of MFCCs,
            # i.e. it is c-contiguous, but after dropping the first coefficient it ceases to be c-contiguous.
            # Should decide whether to make a copy or to work around the first coefficient.
            text_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(text_wav_path).all_mfcc.T[:, 1:]
            )

        if process_next_audio:
            try:
                audio_path = next(audio_paths)
            except StopIteration:
                break

            audio_name = get_name_from_path(audio_path)
            output_audio_name = os.path.join(sync_map_audio_path_prefix, audio_name)
            audio_wav_path = os.path.join(tmp_dir, f'{drop_extension(audio_name)}_audio.wav')
            # NOTE(review): the exit status of ffmpeg is not checked here; a failed
            # conversion would only surface when the wav file is read below.
            subprocess.run(['ffmpeg', '-n', '-i', audio_path, audio_wav_path])

            audio_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(audio_wav_path).all_mfcc.T[:, 1:]
            )

            # Keep track of the offset to calculate frame timings
            audio_start_frame = 0

        # NOTE(review): n is assigned but not used below.
        n = len(text_mfcc_sequence)
        m = len(audio_mfcc_sequence)

        _, path = c_FastDTWBD(text_mfcc_sequence, audio_mfcc_sequence, skip_penalty, radius=radius)

        if len(path) == 0:
            print(
                f'No match between {text_name} and {audio_name}. '
                f'Alignment is terminated. '
                f'Adjust skip_penalty or input files.'
            )
            return {}

        # Project the path onto the text and audio sequences
        text_path_frames = path[:,0]
        audio_path_frames = path[:,1]

        last_matched_audio_frame = audio_path_frames[-1]

        # Find first and last matched frames
        first_matched_text_frame = text_path_frames[0]
        last_matched_text_frame = text_path_frames[-1]

        # Map only those fragments that intersect matched frames
        anchors_boundary_indices = np.searchsorted(
            anchors, [first_matched_text_frame, last_matched_text_frame]
        )
        map_anchors_from = max(anchors_boundary_indices[0] - 1, 0)
        map_anchors_to = anchors_boundary_indices[1]
        anchors_to_map = anchors[map_anchors_from:map_anchors_to]
        fragments_to_map = fragments[map_anchors_from:map_anchors_to]

        # Get anchor indices in the path projection onto the text sequence
        text_path_anchor_indices = np.searchsorted(text_path_frames, anchors_to_map)

        # Get the anchors' frames in the audio sequence, calculate their timings;
        # the last matched audio frame is appended so the final fragment gets an end time
        anchors_matched_frames = audio_path_frames[text_path_anchor_indices]
        timings = (np.append(anchors_matched_frames, audio_path_frames[-1]) + audio_start_frame) * 0.040

        # Map fragment ids to timings, update the mapping of the current text file
        fragment_map = {
            f: {
                'audio_file': output_audio_name,
                'begin_time': time_to_str(bt),
                'end_time': time_to_str(et)
            }
            for f, bt, et in zip(fragments_to_map, timings[:-1], timings[1:])
        }

        sync_map[output_text_name].update(fragment_map)

        # Decide whether to process the next file or to align the tail of the current one

        if map_anchors_to == len(anchors):
            # Process the next text if no fragments are left
            process_next_text = True
        else:
            # Otherwise align the tail of the current text
            process_next_text = False
            text_mfcc_sequence = text_mfcc_sequence[last_matched_text_frame:]
            fragments = fragments[map_anchors_to:]
            anchors = anchors[map_anchors_to:] - last_matched_text_frame

        if last_matched_audio_frame == m - 1 or not process_next_text:
            # Process the next audio if there are no unmatched audio frames in the tail
            # or there are more text fragments to map, i.e.
            # we choose to process the next audio if we cannot decide.
            # This strategy is correct if there are no extra fragments at the end.
            process_next_audio = True
        else:
            # Otherwise align the tail of the current audio
            process_next_audio = False
            audio_mfcc_sequence = audio_mfcc_sequence[last_matched_audio_frame:]
            audio_start_frame += last_matched_audio_frame

    return sync_map