def extract_alignment(self, alignment_path):
    # a ".bundle" file lists one cache archive per line, otherwise the path is a single archive
    if alignment_path.endswith(".bundle"):
        with open(alignment_path, "rt") as bundle:
            cache_paths = [line.strip() for line in bundle]
    else:
        cache_paths = [alignment_path]

    for cache_path in cache_paths:
        sprint_cache = FileArchive(cache_path)
        sprint_cache.setAllophones(tk.uncached_path(self.allophone_file))
        keys = [str(s) for s in sprint_cache.ft if not str(s).endswith(".attribs")]
        for key in keys:
            # only extract time and mix, the HMM state is not needed
            alignment = [[a[0], a[1], sprint_cache.allophones[a[1]]]
                         for a in sprint_cache.read(key, 'align')]
            yield (key, alignment)
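# Hedged usage sketch (illustrative values, not taken from a real cache): extract_alignment
# is a generator yielding (segment_key, alignment) pairs, where every alignment entry is
# [time_frame, mix_index, allophone_string], e.g.
#   ("corpus/recording/1",
#    [[0, 4711, "[SILENCE]{#+#}@i@f"], [1, 4711, "[SILENCE]{#+#}@i@f"], ...])
# extract_silence() below relies on exactly this layout when it splits off the "@i"/"@f" markers.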
def run(self):
    c = corpus.Corpus()
    nc = corpus.Corpus()

    c.load(tk.uncached_path(self.corpus_file))
    nc.name = c.name
    nc.speakers = c.speakers
    nc.default_speaker = c.default_speaker
    nc.speaker_name = c.speaker_name

    # copy all recordings into the new corpus, pointing their audio to the converted files
    for r in c.recordings:
        nr = corpus.Recording()
        nr.name = r.name
        nr.segments = r.segments
        nr.speaker_name = r.speaker_name
        nr.speakers = r.speakers
        nr.default_speaker = r.default_speaker
        audio_name = r.audio.split("/")[-1]

        if self.output_format is not None:
            name, ext = os.path.splitext(audio_name)
            audio_name = name + "." + self.output_format

        nr.audio = os.path.join(tk.uncached_path(self.audio_folder), audio_name)
        nc.add_recording(nr)

    # convert the audio files in parallel
    from multiprocessing import pool
    p = pool.Pool(4)
    p.map(self.perform_ffmpeg, c.recordings)

    nc.dump(tk.uncached_path(self.out))
def create_files(self):
    # copy the RETURNN config and write an executable rnn.sh wrapper
    shutil.copy(tk.uncached_path(self.returnn_config_file_in),
                tk.uncached_path(self.returnn_config_file))

    parameter_list = self.get_parameter_list()

    with open('rnn.sh', 'wt') as f:
        f.write('#!/usr/bin/env bash\n%s' % ' '.join(
            [tk.uncached_path(self.returnn_python_exe),
             os.path.join(tk.uncached_path(self.returnn_root), 'rnn.py'),
             self.returnn_config_file.get_path()] + parameter_list))

    os.chmod('rnn.sh',
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH | stat.S_IWUSR |
             stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
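# Hedged sketch of the resulting rnn.sh (paths and parameters are hypothetical):
#   #!/usr/bin/env bash
#   /path/to/python3 /path/to/returnn/rnn.py returnn.config ++learning_rate 0.001
# i.e. the RETURNN python executable, rnn.py from returnn_root, the copied config,
# and the "++key value" pairs produced by get_parameter_list() below.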
def run(self):
    import soundfile

    c = corpus.Corpus()
    c.load(tk.uncached_path(self.bliss_corpus))

    for r in c.all_recordings():
        assert len(r.segments) == 1, "needs to be a single segment recording"
        old_duration = r.segments[0].end
        data, sample_rate = soundfile.read(open(r.audio, "rb"))
        new_duration = len(data) / sample_rate
        print("%s: %f vs. %f" % (r.segments[0].name, old_duration, new_duration))
        r.segments[0].end = new_duration

    c.dump(tk.uncached_path(self.out))
def run(self):
    d = eval(open(self.search_py_output.get_path(), "r").read())
    assert isinstance(d, dict)  # seq_tag -> bpe string
    assert not os.path.exists(self.out_word_search_results.get_path())

    with open(tk.uncached_path(self.out_word_search_results), "w") as out:
        out.write("{\n")
        for seq_tag, txt in sorted(d.items()):
            if "#" in seq_tag:
                tag_split = seq_tag.split("/")
                recording_name, segment_name = tag_split[2].split("#")
                seq_tag = tag_split[0] + "/" + recording_name + "/" + segment_name
            out.write("%r: %r,\n" % (seq_tag, txt.replace(" ", "").replace("▁", " ").strip()))
        out.write("}\n")
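# Hedged example of the conversion (values are hypothetical): a subword entry like
#   "corpus/sub/recording#segment": "▁the ▁qui ck ▁brown"
# is written out as
#   "corpus/recording/segment": "the quick brown"
# i.e. the "#"-style tag is rewritten into a three-part segment tag and the
# "▁" word-boundary markers are turned back into spaces between words.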
def perform_ffmpeg(self, r):
    audio_name = r.audio.split("/")[-1]

    if self.output_format is not None:
        name, ext = os.path.splitext(audio_name)
        audio_name = name + "." + self.output_format

    target = tk.uncached_path(self.audio_folder) + "/" + audio_name
    # the duration is currently not extracted, so this always returns None
    seconds = None

    if not os.path.exists(target):
        result = self.sh("%s -hide_banner -y -i %s %s {audio_folder}/%s" %
                         (self.ffmpeg_binary, r.audio, self.ffmpeg_option_string, audio_name),
                         include_stderr=True)
    else:
        print("found %s" % target)

    return seconds
def get_parameter_list(self):
    parameter_list = []
    for k, v in sorted(self.parameter_dict.items()):
        if isinstance(v, tk.Variable):
            v = str(v.get())
        elif isinstance(v, tk.Path):
            v = tk.uncached_path(v)
        elif isinstance(v, list):
            v = "\"%s\"" % str(v).replace(" ", "")
        else:
            v = str(v)

        if v.startswith("-"):
            v = "-- " + v

        parameter_list.append("++%s" % k)
        parameter_list.append(v)

    return parameter_list
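# Hedged example (hypothetical parameter_dict, not part of the original job):
#   self.parameter_dict = {"learning_rate": 0.001, "dev": ["data/dev.hdf"]}
# yields
#   ["++dev", "\"['data/dev.hdf']\"", "++learning_rate", "0.001"]
# Lists are quoted with their spaces removed, and values starting with "-" get a
# leading "-- " (presumably so they are not parsed as options); the pairs end up
# appended to the command line written into rnn.sh by create_files().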
def recover_duration(self):
    run_duration_recover("temp_corpus.xml.gz", tk.uncached_path(self.out_corpus))
def cut_audio(self):
    c = corpus.Corpus()
    c.load(tk.uncached_path(self.bliss_corpus))

    groups_dict = pickle.load(open("groups.pkl", "rb"))

    empty_recordings = []
    ffmpeg_commands = []
    for recording in c.all_recordings():
        assert len(recording.segments) == 1
        segment = recording.segments[0]
        in_file = recording.audio

        target_file = "_".join(segment.fullname().split("/"))
        if self.output_format:
            target_file += "." + self.output_format
        else:
            target_file += os.path.splitext(in_file)[1]
        target_file = os.path.join(tk.uncached_path(self.out_audio_folder), target_file)

        groups = groups_dict[segment.fullname()]
        if len(groups) == 0:
            empty_recordings.append(recording)
            continue

        ffmpeg_command = ["ffmpeg", "-y", "-i", in_file, "-filter_complex"]

        # one atrim filter per speech group
        split_orth = segment.orth.split(" _ ")
        filter_commands = []
        for i, new_group in enumerate(groups[0]):
            command = "[0]atrim=%.3f:%.3f[g%i]" % (new_group[0], new_group[1], i)
            filter_commands.append(command)

        # insert silence symbols into the orthography
        split_orth = split_orth[0].split(" ")
        count = 0
        if self.silence_symbol is not None:
            for i, grp in enumerate(groups[1]):
                word_id = grp[0] + count
                duration = int(grp[1]) / (self.silence_symbol_duration / self.window_shift)
                # round to the nearest whole number of silence symbols
                if duration - math.floor(duration) < 0.5:
                    duration = math.floor(duration)
                else:
                    duration = math.ceil(duration)
                if duration != 0:
                    split_orth.insert(word_id, self.silence_symbol * duration)
                    count = count + 1
            segment.orth = " ".join(split_orth)

        # concatenate all trimmed groups into a single output stream
        filter_command = ";".join(filter_commands)
        filter_command += ";" + "".join(["[g%i]" % i for i in range(len(groups[0]))]) \
                          + "concat=n=%i:v=0:a=1[out]" % len(groups[0])

        ffmpeg_command += [filter_command, "-map", "[out]", target_file]
        print(" ".join(ffmpeg_command))
        ffmpeg_commands.append(ffmpeg_command)
        recording.audio = target_file

    def delete_recordings(c, recordings):
        for subcorpus in c.subcorpora:
            delete_recordings(subcorpus, recordings)
        for r in recordings:
            if r in c.recordings:
                print("deleting empty recording %s" % r.name)
                c.recordings.remove(r)

    delete_recordings(c, empty_recordings)

    c.dump("temp_corpus.xml.gz")

    p = multiprocessing.Pool(processes=4)
    p.map(self.run_subprocess, ffmpeg_commands)
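# Hedged illustration (times are made up): for two speech groups
# (0.000-1.250 and 2.000-3.500) the constructed filter_complex argument is
#   [0]atrim=0.000:1.250[g0];[0]atrim=2.000:3.500[g1];[g0][g1]concat=n=2:v=0:a=1[out]
# so every speech group is trimmed from the input and the groups are concatenated
# into one output audio stream per segment.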
def extract_silence(self):
    """
    TODO: fix the high memory consumption
    :return:
    """
    alignment_path = tk.uncached_path(self.alignment_cache)

    groups_dict = {}
    for key, cache in self.extract_alignment(alignment_path):
        length = len(cache)
        indices = numpy.asarray([numpy.minimum(1, entry[1]) for entry in cache])
        word_tokens = []
        for i in range(length):
            word_tokens.append(cache[i][2].split("}")[-1])

        words = 0
        silence_duration = 0
        in_word = False
        in_silence = False
        silence_word_positions = []

        groups = []
        in_group = bool(indices[0])
        group_start = 0
        group_end = 0

        for i, (speech, word_token) in enumerate(zip(indices, word_tokens)):
            # dealing with word tokens
            assert word_token in ['', '@i', '@f', '@i@f']
            if word_token == "@i" and not in_word and in_silence:
                in_word = True
                in_silence = False
                words += 1
            if word_token == "@i" and not in_word:
                in_word = True
            if word_token == "@i" and silence_duration > 0:
                # clip the silence duration to the maximum number of frames we allow,
                # e.g. 500ms pause / 10ms shift = 50 frames:
                # after word 12 there are 80 frames silence -> clip to 50 frames
                silence_word_positions.append(
                    (words, numpy.minimum(silence_duration, self.pause_duration / self.window_shift)))
                silence_duration = 0
            if in_word and word_token == "@f":
                words += 1
                in_word = False
            if word_token == "@i@f" and speech == 0:
                silence_duration += 1
            if word_token == "@i@f" and speech == 1:
                in_silence = True

            # dealing with speech/silence
            if not in_group and speech == 1:
                if group_start == 0 and group_end == 0:
                    group_start = i * self.window_shift
                    in_group = True
                elif (i * self.window_shift - group_end) > self.pause_duration:
                    group_end = group_end + self.pause_duration / 2
                    groups.append((group_start, group_end))
                    group_start = i * self.window_shift - self.pause_duration / 2
                    in_group = True
                else:
                    in_group = True
            if in_group and speech == 0:
                group_end = i * self.window_shift
                in_group = False

        if group_start < group_end:
            groups.append((group_start, group_end))
        if group_start > group_end:
            group_end = group_start + self.window_shift
            groups.append((group_start, group_end))

        # store groups and silence, drop the first silence word position
        # as we never have silence in the beginning
        groups_dict[key] = [groups, silence_word_positions[1:]]

    pickle.dump(groups_dict, open("groups.pkl", "wb"))
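# Hedged sketch of the pickled structure (values are illustrative): for every segment key
#   groups_dict[key] == [groups, silence_word_positions]
# where groups is a list of (start, end) speech regions in the same unit as window_shift,
# and silence_word_positions is a list of (word_index, clipped_silence_frames) pairs, e.g.
#   [[(0.0, 1.25), (1.5, 3.2)], [(12, 50)]]
# cut_audio() above reads exactly this structure back from "groups.pkl".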