def run(self):
    """Concatenate the audio of all segments in each cluster and dump a
    corpus with one recording (holding one catch-all segment) per cluster.
    """
    with uopen(self.cluster_names, "rt") as names_file:
        cluster_names = [line.strip() for line in names_file]

    # cluster name -> set of segment full names
    clusters = collections.defaultdict(set)
    with uopen(self.cluster_map, "rt") as map_file:
        tree = ET.parse(map_file)
        for item in tree.findall(".//map-item"):
            segment_key = item.attrib["key"]
            # map values end in a 1-based cluster index, e.g. "<...>.3"
            cluster_idx = int(item.attrib["value"].split(".")[-1]) - 1
            clusters[cluster_names[cluster_idx]].add(segment_key)

    c = corpus.Corpus()
    c.load(tk.uncached_path(self.corpus_file))
    original_segments = {seg.fullname(): seg for seg in c.segments()}

    audio = {}
    transcriptions = {}
    for cluster_name in clusters:
        members = sorted(clusters[cluster_name])
        clusters[cluster_name] = members
        transcriptions[cluster_name] = " ".join(
            original_segments[m].orth for m in members
        )
        audio[cluster_name] = [
            (
                original_segments[m].recording.audio,
                original_segments[m].start,
                original_segments[m].end,
            )
            for m in members
        ]

    new_c = corpus.Corpus()
    new_c.name = c.name
    for cluster_name, audio_files in audio.items():
        out_path = os.path.join(self.audio_output.get_path(), cluster_name + ".wav")
        if os.path.exists(out_path):
            os.unlink(out_path)
        # write an ffmpeg concat-demuxer list for this cluster
        with open(f"{cluster_name}.txt", "wt") as list_file:
            for audio_path, start, end in audio_files:
                list_file.write(f"file {audio_path}\ninpoint {start}\n")
                # an infinite end means "until end of file": no outpoint
                if not math.isinf(end):
                    list_file.write(f"outpoint {end}\n")
        self.sh(
            f"ffmpeg -loglevel fatal -hide_banner -f concat -safe 0 -i '{cluster_name}.txt' '{out_path}'"
        )
        rec = corpus.Recording()
        rec.name = cluster_name
        rec.audio = out_path
        seg = corpus.Segment()
        seg.name = "1"
        seg.start = 0.0
        seg.end = float("inf")
        seg.orth = transcriptions[cluster_name]
        rec.add_segment(seg)
        new_c.add_recording(rec)
    new_c.dump(self.output_corpus.get_path())
def run(self):
    """Copy the corpus, shifting each segment's start time by ``self.shift``,
    and write the list of all segment full names (one per line).

    NOTE(review): only ``start`` is shifted, not ``end`` — confirm this is the
    intended semantics, since it changes segment durations.
    """
    c = corpus.Corpus()
    nc = corpus.Corpus()
    segment_file_names = []
    c.load(tk.uncached_path(self.bliss_corpus))
    nc.name = self.corpus_name
    nc.speakers = c.speakers
    nc.default_speaker = c.default_speaker
    nc.speaker_name = c.speaker_name
    for r in c.recordings:
        sr = corpus.Recording()
        sr.name = r.name
        sr.segments = r.segments
        sr.speaker_name = r.speaker_name
        sr.speakers = r.speakers
        sr.default_speaker = r.default_speaker
        sr.audio = r.audio
        nc.add_recording(sr)
        for s in sr.segments:
            # BUG FIX: writelines() does not append newlines, so the segment
            # list was previously written as one long line.
            segment_file_names.append(nc.name + "/" + sr.name + "/" + s.name + "\n")
            s.start += self.shift
    nc.dump(str(self.out_shifted_corpus))
    with open(str(self.out_segments), "w") as segments_outfile:
        segments_outfile.writelines(segment_file_names)
def run(self):
    """Dump a ``{segment_fullname: orth}`` dictionary, optionally restricted
    to a segment whitelist (or its complement when ``invert_match`` is set).
    """
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())

    whitelist = None
    if self.segment_file:
        with uopen(self.segment_file) as f:
            # the segment file is read in binary mode, hence the decode
            whitelist = {line.decode().strip() for line in f}

    orth_by_key = {}
    for seg in c.segments():
        key = seg.fullname()
        if whitelist:
            if self.invert_match:
                if key in whitelist:
                    continue
            elif key not in whitelist and seg.name not in whitelist:
                continue
        orth_by_key[key] = seg.orth.strip()

    with uopen(self.out_dictionary, "wt") as f:
        f.write(pprint.pformat(orth_by_key, width=1000))
def run(self):
    """Keep (or, with ``invert_match``, drop) exactly the segments listed in
    the given segment files, matching on full name or plain name.
    """
    all_segments = []
    for seg_path in self.segment_file_list:
        with open(tk.uncached_path(seg_path)) as f:
            all_segments.extend(line.strip() for line in f)
    logging.info("There are #{} segments in the segment list.".format(len(all_segments)))
    segment_set = set(all_segments)

    c = corpus.Corpus()
    c.load(tk.uncached_path(self.bliss_corpus))
    for rec in c.all_recordings():
        if self.invert_match:
            rec.segments = [
                seg for seg in rec.segments
                if seg.fullname() not in segment_set and seg.name not in segment_set
            ]
        else:
            rec.segments = [
                seg for seg in rec.segments
                if seg.fullname() in segment_set or seg.name in segment_set
            ]
    c.dump(tk.uncached_path(self.out_corpus))
def run(self):
    """Remove every segment whose orth consists exclusively of words that do
    not occur in the lexicon's orth inventory.
    """

    def maybe_to_lower(s):
        return s if self.case_sensitive else s.lower()

    lex_path = tk.uncached_path(self.lexicon)
    open_func = gzip.open if lex_path.endswith(".gz") else open
    with open_func(lex_path, "rt") as f:
        lex_root = ET.parse(f)
    vocabulary = {
        maybe_to_lower(orth.text.strip() if orth.text else "")
        for orth in lex_root.findall(".//orth")
    }

    c = corpus.Corpus()
    c.load(tk.uncached_path(self.corpus))

    def not_only_unknowns(corpus, recording, segment):
        """
        :param Corpus corpus:
        :param Recording recording:
        :param Segment segment:
        :return: whether the orth of segment contains at least one known word
        :rtype: bool
        """
        orth = segment.orth
        if not orth:
            # segments without an orth are always kept
            return True
        return any(
            maybe_to_lower(word) in vocabulary
            for word in orth.strip().split(" ")
        )

    c.filter_segments(not_only_unknowns)
    c.dump(self.out_corpus.get_path())
def run(self):
    """Build a bliss corpus from an LJSpeech-style ``metadata`` file of
    ``name|text|processed_text`` lines, one recording/segment per wav file.
    Segment end times are taken from the wav headers.
    """
    c = corpus.Corpus()
    c.name = self.name
    with uopen(self.metadata, "rt") as metadata_file:
        for line in metadata_file:
            name, text, processed_text = line.split("|")
            audio_file_path = os.path.join(
                self.audio_folder.get_path(), name + ".wav"
            )
            assert os.path.isfile(
                audio_file_path
            ), "Audio file %s was not found in provided audio path %s" % (
                audio_file_path,
                self.audio_folder.get_path(),
            )
            recording = corpus.Recording()
            recording.name = name
            recording.audio = audio_file_path
            segment = corpus.Segment()
            segment.orth = processed_text.strip()
            segment.name = name
            # BUG FIX: use a context manager so the wav file is also closed
            # when reading the header raises (was leaked on exception before)
            with wave.open(audio_file_path) as wave_info:
                segment.start = 0
                segment.end = wave_info.getnframes() / wave_info.getframerate()
            recording.add_segment(segment)
            c.add_recording(recording)
    c.dump(self.out_bliss_corpus.get_path())
def run(self):
    """Replace each word of every segment orth by its phoneme sequence from
    the lexicon, joining words with the word-separation phoneme.

    :raises LookupError: if a word has no lexicon entry (OOV)
    """
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    lex = lexicon.Lexicon()
    lex.load(self.bliss_lexicon.get_path())

    # build orth -> phoneme-sequence lookup (first pronunciation wins)
    lookup_dict = {}
    for lemma in lex.lemmata:
        for orth in lemma.orth:
            if orth and self.strategy == LexiconStrategy.PICK_FIRST:
                if len(lemma.phon) > 0:
                    lookup_dict[orth] = lemma.phon[0]

    word_separation_phon = lookup_dict[self.word_separation_orth]
    # BUG FIX: log message typo "symbold" -> "symbol"
    print("using word separation symbol: %s" % word_separation_phon)
    separator = " %s " % word_separation_phon

    for segment in c.segments():
        try:
            words = [lookup_dict[w] for w in segment.orth.split(" ")]
        except LookupError as exc:
            # BUG FIX: chain the original error so the offending word is
            # visible in the traceback instead of being swallowed.
            raise LookupError(
                "Out-of-vocabulary word detected, please make sure that there are no OOVs remaining by e.g. applying G2P"
            ) from exc
        segment.orth = separator.join(words)
    c.dump(self.out_corpus.get_path())
def run(self):
    """Cut all recordings of the corpus in parallel worker processes,
    logging progress every 100 finished files.
    """
    self.corpus_object = corpus.Corpus()
    self.corpus_object.load(self.bliss_corpus_file.get_path())
    recordings = list(self.corpus_object.all_recordings())
    print(f"{len(recordings)} recordings detected")
    print(f"launching {self.n_workers} processes")
    tasks = [
        (rec, self.out_audio_path, self.target_length, self.file_extension)
        for rec in recordings
    ]
    with multiprocessing.Pool(processes=self.n_workers) as pool:
        done = 0
        for _ in pool.imap_unordered(self.cut_file, tasks):
            if done % 100 == 0:
                logging.info(f"{done} of {len(tasks)} files done")
            done += 1
def run(self):
    """Speed-perturb every recording with ffmpeg's ``asetrate`` filter and
    write a corpus whose segment times are rescaled accordingly, plus the
    list of all segment full names (one per line).
    """
    if not os.path.isdir(str(self.out_audio_folder)):
        self.sh("mkdir '{audio_out}'")
    c = corpus.Corpus()
    nc = corpus.Corpus()
    segment_file_names = []
    c.load(tk.uncached_path(self.bliss_corpus))
    nc.name = self.corpus_name
    nc.speakers = c.speakers
    nc.default_speaker = c.default_speaker
    nc.speaker_name = c.speaker_name
    for r in c.recordings:
        perturbed_audio_name = "perturbed_" + r.audio.split("/")[-1]
        # {base_frequency}, {speed_factor} and {audio_out} are resolved by self.sh
        self.sh(
            "ffmpeg -hide_banner -i '%s' -filter:a \"asetrate={base_frequency}*{speed_factor}\" "
            "-ar {base_frequency} '{audio_out}/%s'" % (r.audio, perturbed_audio_name)
        )
        pr = corpus.Recording()
        pr.name = r.name
        pr.segments = r.segments
        pr.speaker_name = r.speaker_name
        pr.speakers = r.speakers
        pr.default_speaker = r.default_speaker
        pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name
        nc.add_recording(pr)
        for s in pr.segments:
            # BUG FIX: writelines() does not append newlines, so the segment
            # list was previously written as one long line.
            segment_file_names.append(nc.name + "/" + pr.name + "/" + s.name + "\n")
            # stretch the timeline by the speed factor
            s.start /= self.speed_factor
            s.end /= self.speed_factor
    nc.dump(str(self.out_corpus))
    with open(str(self.out_segment_file), "w") as segments_outfile:
        segments_outfile.writelines(segment_file_names)
def run(self):
    """Merge several bliss corpora into a single corpus according to
    ``self.merge_strategy``.
    """
    merged = corpus.Corpus()
    merged.name = self.name
    for corpus_path in self.bliss_corpora:
        c = corpus.Corpus()
        c.load(tk.uncached_path(corpus_path))
        if self.merge_strategy == MergeStrategy.SUBCORPORA:
            # keep each input corpus as its own subcorpus
            merged.add_subcorpus(c)
        elif self.merge_strategy == MergeStrategy.FLAT:
            # pull every recording (recursively) up to the top level
            for rec in c.all_recordings():
                merged.add_recording(rec)
            merged.speakers.update(c.speakers)
        elif self.merge_strategy == MergeStrategy.CONCATENATE:
            # splice the top-level structure of each corpus together
            for sub in c.top_level_subcorpora():
                merged.add_subcorpus(sub)
            for rec in c.top_level_recordings():
                merged.add_recording(rec)
            for speaker in c.top_level_speakers():
                merged.add_speaker(speaker)
        else:
            assert False, "invalid merge strategy"
    merged.dump(self.out_merged_corpus.get_path())
def run(self):
    """Copy segment orths from a reference corpus into this corpus, matching
    segments by their full name.

    :raises AssertionError: if a segment is missing from the reference corpus
    """
    ref = corpus.Corpus()
    ref.load(self.reference_bliss_corpus.get_path())
    # full segment name -> reference orth
    orths = {}
    for r in ref.all_recordings():
        for seg in r.segments:
            orths[seg.fullname()] = seg.orth

    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    for r in c.all_recordings():
        for seg in r.segments:
            tag = seg.fullname()
            # BUG FIX: the assert message referenced the undefined name `s`,
            # which raised a NameError instead of the intended message.
            assert tag in orths, "Segment %s not found in reference corpus" % tag
            seg.orth = orths[tag]
    c.dump(self.out_corpus.get_path())
def run(self):
    """Drop all segments whose duration lies outside
    ``[min_duration, max_duration]``; infinite-length segments are kept.
    """

    def good_duration(corpus, recording, segment):
        duration = segment.end - segment.start
        if duration == float("inf"):
            # open-ended segments always pass the filter
            return True
        return self.min_duration <= duration <= self.max_duration

    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    c.filter_segments(good_duration)
    c.dump(self.out_corpus.get_path())
def run(self):
    """Count word occurrences over all segment orths and write them as
    ``count<TAB>word`` lines, sorted by descending count, then word.
    """
    c = libcorpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    word_counts = Counter()
    for seg in c.segments():
        word_counts.update(seg.orth.strip().split())
    ordered = sorted(
        ((count, word) for word, count in word_counts.items()),
        key=lambda t: (-t[0], t[1]),
    )
    with uopen(self.out_word_counts, "wt") as f:
        f.write("\n".join("%d\t%s" % entry for entry in ordered))
def run(self):
    """Write the orth of every segment (optionally restricted to a whitelist
    of segment full names) as plain text, one segment per line.
    """
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    whitelist = None
    if self.segment_file:
        with uopen(self.segment_file, "rt") as f:
            whitelist = {line.strip() for line in f if line.strip()}
    with uopen(self.out_txt.get_path(), "wt") as out:
        for segment in c.segments():
            # an empty/absent whitelist means "dump everything"
            if whitelist and segment.fullname() not in whitelist:
                continue
            out.write(segment.orth + "\n")
def run(self):
    """Replace segment orths by the words of a CTM file that fall inside each
    segment's time span; optionally remove segments/recordings that end up
    without any words.
    """
    # recording name -> sorted list of (start_time, word)
    transcriptions = collections.defaultdict(list)
    with open(tk.uncached_path(self.ctm_path), "rt") as f:
        for line in f:
            if line.startswith(";;"):  # CTM comment line
                continue
            fields = line.split()
            if 5 <= len(fields) <= 6:
                recording = fields[0]
                start = float(fields[2])
                word = fields[4]
                transcriptions[recording].append((start, word))
    for times_and_words in transcriptions.values():
        times_and_words.sort()

    c = corpus.Corpus()
    c.load(tk.uncached_path(self.bliss_corpus))

    recordings_to_delete = []
    for recording in c.all_recordings():
        times = [t for t, _ in transcriptions[recording.name]]
        words = [w for _, w in transcriptions[recording.name]]
        if len(words) == 0 and self.remove_empty_segments:
            # BUG FIX: this was `recordings_to_delete = recording`, which
            # overwrote the list instead of collecting the recording
            recordings_to_delete.append(recording)
            continue
        segments_to_delete = []
        for idx, segment in enumerate(recording.segments):
            left_idx = bisect.bisect_left(times, segment.start)
            right_idx = bisect.bisect_left(times, segment.end)
            if left_idx == right_idx and self.remove_empty_segments:
                segments_to_delete.append(idx)
                continue
            # NOTE(review): replace("&", "&") is a no-op; possibly "&amp;"
            # was intended originally — kept as-is to preserve output.
            segment.orth = " ".join(words[left_idx:right_idx]).replace("&", "&")
        for sidx in reversed(segments_to_delete):
            del recording.segments[sidx]

    # BUG FIX: the collected empty recordings were never actually removed
    def _remove_recordings(corp, recs):
        for sub in corp.subcorpora:
            _remove_recordings(sub, recs)
        corp.recordings = [r for r in corp.recordings if r not in recs]

    if recordings_to_delete:
        _remove_recordings(c, set(recordings_to_delete))

    c.dump(self.output_corpus_path.get_path())
def run(self):
    """Convert all recordings with ffmpeg in parallel and point the corpus at
    the converted audio; dump to a temp corpus when duration recovery is
    requested, otherwise directly to the output corpus.
    """
    c = corpus.Corpus()
    c.load(tk.uncached_path(self.corpus_file))

    from multiprocessing import pool

    # BUG FIX: use a context manager so the pool is closed and joined
    # (it was previously leaked)
    with pool.Pool(self.rqmt["cpu"]) as p:
        p.map(self._perform_ffmpeg, c.recordings)

    for r in c.recordings:
        audio_filename = self._get_output_filename(r)
        r.audio = os.path.join(self.out_audio_folder.get_path(), audio_filename)

    if self.recover_duration:
        # durations are fixed up in a later task reading this temp corpus
        c.dump("temp_corpus.xml.gz")
    else:
        c.dump(tk.uncached_path(self.out_corpus))
def run_duration_recover(source_corpus, target_corpus):
    """
    iterates over a single segment bliss corpus and uses the soundfile
    library to get the actual recording length

    :param source_corpus: path of the corpus to fix up
    :param target_corpus: path to dump the corrected corpus to
    :return:
    """
    c = corpus.Corpus()
    c.load(source_corpus)
    for r in c.all_recordings():
        assert len(r.segments) == 1, "needs to be a single segment recording"
        old_duration = r.segments[0].end
        # BUG FIX: close the audio file handle again (was leaked before)
        with open(r.audio, "rb") as audio_file:
            data, sample_rate = soundfile.read(audio_file)
        new_duration = len(data) / sample_rate
        print("%s: %f vs. %f" % (r.segments[0].name, old_duration, new_duration))
        r.segments[0].end = new_duration
    c.dump(target_corpus)
def run_recover_duration(self):
    """
    Open all files with "soundfile" and extract the length information

    :return:
    """
    import soundfile

    c = corpus.Corpus()
    c.load("temp_corpus.xml.gz")
    for r in c.all_recordings():
        assert len(r.segments) == 1, "needs to be a single segment recording"
        old_duration = r.segments[0].end
        # BUG FIX: close the audio file handle again (was leaked before)
        with open(r.audio, "rb") as audio_file:
            data, sample_rate = soundfile.read(audio_file)
        new_duration = len(data) / sample_rate
        logging.info(
            "%s: adjusted from %f to %f seconds"
            % (r.segments[0].name, old_duration, new_duration)
        )
        r.segments[0].end = new_duration
    c.dump(self.out_corpus.get_path())
def run(self):
    """Overwrite segment orths with the lines of a text file, one line per
    (optionally whitelisted) segment, asserting the counts match exactly.
    """
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    if self.segment_file:
        with uopen(self.segment_file.get_path(), "rt") as f:
            whitelist = {line.strip() for line in f if line.strip()}
        segment_iterator = (s for s in c.segments() if s.fullname() in whitelist)
    else:
        segment_iterator = c.segments()
    with uopen(self.text_file, "rt") as f:
        # zip_longest so a length mismatch in either direction is detected
        for segment, line in itertools.zip_longest(segment_iterator, f):
            assert segment is not None, "there were more text file lines than segments"
            assert line is not None, "there were less text file lines than segments"
            assert len(line) > 0
            segment.orth = line.strip()
    c.dump(self.out_corpus.get_path())
def run(self):
    """Create a bliss corpus from LibriSpeech-style transcript and speaker
    metadata; only speakers that actually occur get speaker entries.
    """
    self._get_speakers()
    self._get_transcripts()
    c = corpus.Corpus()
    c.name = os.path.basename(self.corpus_folder.get_path())

    used_speaker_ids = set()  # store which speakers are used
    for transcript in self._transcripts:
        name = "{0}-{1}-{2:04d}".format(
            transcript["speaker_id"], transcript["chapter"], transcript["segment"]
        )
        used_speaker_ids.add(transcript["speaker_id"])

        recording = corpus.Recording()
        recording.name = name
        recording.speaker_name = transcript["speaker_id"]
        recording.audio = "{}/{}.flac".format(transcript["path"], name)

        segment = corpus.Segment()
        segment.name = name
        segment.start = 0
        segment.end = float("inf")
        segment.orth = transcript["orth"].strip()
        recording.segments.append(segment)
        c.recordings.append(recording)

    for speaker_id, speaker_info in sorted(self._speakers.items()):
        if speaker_id in used_speaker_ids:
            speaker = corpus.Speaker()
            speaker.name = speaker_id
            speaker.attribs["gender"] = "male" if speaker_info[0] == "M" else "female"
            c.add_speaker(speaker)

    c.dump(self.out_corpus.get_path())
def run(self):
    """Write an STM file for the corpus, tagging every segment with the
    category labels from ``self.tag_mapping`` (default tags d0..dN are used
    for segments not listed in any category file).
    """
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())

    num_categories = len(self.tag_mapping) + 1
    all_tags = [
        ("d%d" % i, "default%d" % i, "all other segments of category %d" % i)
        for i in range(num_categories)
    ]
    # segment full name -> one tag per category, initialized to the defaults
    tag_map = {
        segment.fullname(): ["d%d" % i for i in range(num_categories)]
        for segment in c.segments()
    }
    for i, (tag, segments) in enumerate(self.tag_mapping):
        all_tags.append(tag)
        for file in segments.values():
            for line in uopen(file):
                key = line.rstrip()
                if key in tag_map:
                    tag_map[key][i] = tag[0]

    with uopen(self.out_stm_path, "wt") as out:
        for segment in c.segments():
            speaker = segment.speaker()
            # fall back to the recording name when no speaker is attached
            speaker_name = speaker.name if speaker is not None else segment.recording.name
            segment_track = segment.track + 1 if segment.track else 1
            out.write(
                "%s %d %s %5.2f %5.2f <%s> %s\n"
                % (
                    segment.recording.name,
                    segment_track,
                    speaker_name,
                    segment.start,
                    segment.end,
                    ",".join(tag_map[segment.fullname()]),
                    segment.orth,
                )
            )
        for tag in all_tags:
            out.write(';; LABEL "%s" "%s" "%s"\n' % tag)
def run(self):
    """Rewrite every recording's audio path through the file-caching helper
    and dump the resulting corpus.
    """
    c = corpus.Corpus()
    c.load(tk.uncached_path(self.bliss_corpus))
    for rec in c.all_recordings():
        rec.audio = gs.file_caching(rec.audio)
    c.dump(tk.uncached_path(self.cached_corpus))
def run(self):
    """Assemble the Switchboard-1 bliss corpus from a segment map, a speaker
    list and the audio directory; recordings without speaker info get unique
    placeholder speakers.
    """
    c = corpus.Corpus()
    c.name = "switchboard-1"
    rec_to_segs = self._get_rec_to_segs_map()

    rec_to_speaker = {}
    with uopen(self.speakers_list_file) as f:
        for line in f:
            fields = line.strip().split()
            assert len(fields) == 3
            # BUG FIX: keys are stored with the "sw0" prefix, so the duplicate
            # check must use the prefixed name as well (it could never fire before)
            assert (
                "sw0" + fields[2] not in rec_to_speaker
            ), "duplicate recording name: {}?".format(fields[2])
            assert fields[1] in ["F", "M"]
            # "sw0" prefix is added to match recording names
            rec_to_speaker["sw0" + fields[2]] = {
                "speaker_id": fields[0],
                "gender": {"M": "male", "F": "female"}.get(fields[1]),
            }

    # assume unique speaker for each recording with no speaker info
    unk_spk_id = 1
    for rec in sorted(rec_to_segs.keys()):
        if rec not in rec_to_speaker:
            rec_to_speaker[rec] = {"speaker_id": "speaker#" + str(unk_spk_id)}
            unk_spk_id += 1

    for rec_name, segs in sorted(rec_to_segs.items()):
        recording = corpus.Recording()
        recording.name = rec_name
        recording.audio = os.path.join(self.audio_dir.get_path(), rec_name + ".wav")
        assert os.path.exists(
            recording.audio
        ), "recording {} does not exist?".format(recording.audio)
        assert (
            rec_name in rec_to_speaker
        ), "recording {} does not have speaker id?".format(rec_name)
        rec_speaker_id = rec_to_speaker[rec_name]["speaker_id"]
        for seg in segs:
            segment = corpus.Segment()
            segment.name = seg[0]
            segment.start = float(seg[1])
            segment.end = float(seg[2])
            segment.speaker_name = rec_speaker_id
            segment.orth = self._filter_orth(seg[3])
            # drop segments whose orth is filtered down to nothing
            if len(segment.orth) == 0:
                continue
            recording.segments.append(segment)
        c.recordings.append(recording)

    # add speakers to corpus
    for speaker_info in rec_to_speaker.values():
        speaker = corpus.Speaker()
        speaker.name = speaker_info["speaker_id"]
        if speaker_info.get("gender", None):
            speaker.attribs["gender"] = speaker_info["gender"]
        c.add_speaker(speaker)

    c.dump(self.out_corpus.get_path())
def cut_audio(self):
    """Build ffmpeg commands that cut each (single-segment) recording down to
    the time groups from ``groups.pkl``, optionally inserting silence symbols
    into the orth, then run the commands in parallel.

    NOTE(review): the exact placement of the orth update inside the
    silence-symbol branch was reconstructed from collapsed source — confirm
    against version control.
    """
    c = corpus.Corpus()
    c.load(tk.uncached_path(self.bliss_corpus))
    # BUG FIX: close the pickle file again (handle was leaked before)
    with open("groups.pkl", "rb") as f:
        groups_dict = pickle.load(f)
    empty_recordings = []
    ffmpeg_commands = []
    for recording in c.all_recordings():
        assert len(recording.segments) == 1
        segment = recording.segments[0]
        in_file = recording.audio
        target_file = "_".join(segment.fullname().split("/"))
        if self.output_format:
            target_file += "." + self.output_format
        else:
            target_file += os.path.splitext(in_file)[1]
        target_file = os.path.join(tk.uncached_path(self.out_audio_folder), target_file)
        groups = groups_dict[segment.fullname()]
        if len(groups) == 0:
            # nothing left after cutting: schedule the recording for removal
            empty_recordings.append(recording)
            continue
        ffmpeg_command = ["ffmpeg", "-y", "-i", in_file, "-filter_complex"]
        split_orth = segment.orth.split(" _ ")
        # one atrim filter per kept time group
        filter_commands = []
        for i, new_group in enumerate(groups[0]):
            filter_commands.append(
                "[0]atrim=%.3f:%.3f[g%i]" % (new_group[0], new_group[1], i)
            )
        split_orth = split_orth[0].split(" ")
        count = 0
        if self.silence_symbol is not None:  # BUG FIX: was `!= None`
            for i, grp in enumerate(groups[1]):
                word_id = grp[0] + count
                # number of silence symbols covering this gap, rounded half-up
                duration = int(grp[1]) / (self.silence_symbol_duration / self.window_shift)
                if duration - math.floor(duration) < 0.5:
                    duration = math.floor(duration)
                else:
                    duration = math.ceil(duration)
                if duration != 0:
                    split_orth.insert(word_id, self.silence_symbol * duration)
                    count = count + 1
            segment.orth = " ".join(split_orth)
        filter_command = ";".join(filter_commands)
        filter_command += (
            ";"
            + "".join("[g%i]" % i for i in range(len(groups[0])))
            + "concat=n=%i:v=0:a=1[out]" % (len(groups[0]))
        )
        ffmpeg_command += [filter_command, "-map", "[out]", target_file]
        print(" ".join(ffmpeg_command))
        ffmpeg_commands.append(ffmpeg_command)
        recording.audio = target_file

    def delete_recordings(c, recordings):
        for subcorpus in c.subcorpora:
            delete_recordings(subcorpus, recordings)
        for r in recordings:
            # BUG FIX: only remove recordings actually present at this level;
            # list.remove() raised ValueError for recordings living in a
            # different (sub)corpus before.
            if r in c.recordings:
                print("tried to delete empty recording %s" % r.name)
                c.recordings.remove(r)

    delete_recordings(c, empty_recordings)
    c.dump("temp_corpus.xml.gz")
    # BUG FIX: close and join the pool via context manager (was leaked)
    with multiprocessing.Pool(processes=4) as p:
        p.map(self.run_subprocess, ffmpeg_commands)
def run(self):
    """Mix every recording with concatenated noise tracks built from other
    recordings of the corpus (at -snr dB) and dump the noised corpus plus a
    segment list. Temporary noise tracks live in /dev/shm/<job-id>/.
    """
    # BUG FIX: renamed `id` (shadowed the builtin) to `job_id`
    job_id = os.path.basename(self.job_id())
    if not os.path.isdir(f"/dev/shm/{job_id}"):
        os.mkdir(f"/dev/shm/{job_id}")
    c = corpus.Corpus()
    nc = corpus.Corpus()
    segment_file_names = []
    c.load(tk.uncached_path(self.bliss_corpus))
    nc.name = self.corpus_name
    nc.speakers = c.speakers
    nc.default_speaker = c.default_speaker
    nc.speaker_name = c.speaker_name
    logging.info("Random seed used: {}".format(self.seed))
    rng = random.Random(self.seed)

    # store the end of the last segment per recording
    for r in c.recordings:
        max_seg_end = 0
        for s in r.segments:
            if s.end > max_seg_end:
                max_seg_end = s.end
        r.max_seg_end = max_seg_end

    # select noise files for each recording
    for i, r in enumerate(c.recordings):
        audio_name = r.audio
        target_length = r.max_seg_end
        reverbed_audio_name = "noised_" + audio_name.split("/")[-1]
        # remove any possibly existing temporary recordings
        # (otherwise ffmpeg will ask for override)
        for p in glob.iglob(f"/dev/shm/{job_id}/tmp_concat_*.wav"):
            os.unlink(p)
        for n in range(self.n_noise_tracks):
            noise_length = 0
            noise_audios = []
            while noise_length < target_length:
                random_index = rng.randint(0, len(c.recordings) - 1)
                while random_index == i:
                    # BUG FIX: this retry used the unseeded global `random`
                    # module, breaking reproducibility of the seeded RNG
                    random_index = rng.randint(0, len(c.recordings) - 1)
                noise_audios.append(c.recordings[random_index])
                noise_length += c.recordings[random_index].max_seg_end
            # BUG FIX: the f-prefix was missing, so the literal directory
            # "/dev/shm/{id}/" was used instead of the job's temp directory
            temp_noise_track_file = f"/dev/shm/{job_id}/tmp_concat_{n}.wav"
            self.sh(
                "ffmpeg -hide_banner -loglevel panic -f concat -safe 0 -i <(%s) '%s'"
                % (
                    " ".join('echo "file %s";' % f.audio for f in noise_audios),
                    temp_noise_track_file,
                ),
                except_return_codes=(1,),
            )
        if self.n_noise_tracks == 1:
            # BUG FIX: substitute the temp directory explicitly instead of
            # relying on a "{id}" placeholder that self.sh cannot resolve
            self.sh(
                "ffmpeg -hide_banner -i '%s' -i '/dev/shm/%s/tmp_concat_0.wav' "
                "-filter_complex '[1]volume=-{snr}dB[a];[0][a]amix=duration=first[out]' "
                "-map '[out]' '{audio_out}/%s'"
                % (audio_name, job_id, reverbed_audio_name)
            )
        else:
            ffmpeg_head = "ffmpeg -hide_banner -i '%s' " % audio_name
            noise_inputs = " ".join(
                "-i '/dev/shm/%s/tmp_concat_%i.wav'" % (job_id, k)
                for k in range(self.n_noise_tracks)
            )
            filter_head = ' -filter_complex "'
            volume_reduction = (
                ";".join(
                    "[%i]volume=-%idB[a%i]" % (k + 1, self.snr, k + 1)
                    for k in range(self.n_noise_tracks)
                )
                + ";"
            )
            mixer = (
                "[0]"
                + "".join("[a%i]" % k for k in range(1, self.n_noise_tracks + 1))
                + "amix=duration=first:inputs=%i[out]" % (self.n_noise_tracks + 1)
            )
            filter_tail = '" -map "[out]" "{audio_out}/%s"' % reverbed_audio_name
            self.sh(ffmpeg_head + noise_inputs + filter_head + volume_reduction + mixer + filter_tail)

        nr = corpus.Recording()
        nr.name = r.name
        nr.segments = r.segments
        nr.speaker_name = r.speaker_name
        nr.default_speaker = r.default_speaker
        nr.speakers = r.speakers
        nr.audio = str(self.out_audio_folder) + "/" + reverbed_audio_name
        nc.add_recording(nr)
        for s in nr.segments:
            segment_file_names.append(nc.name + "/" + nr.name + "/" + s.name + "\n")

    nc.dump(self.out_corpus.get_path())
    with open(tk.uncached_path(self.out_segment_file), "w") as segments_outfile:
        segments_outfile.writelines(segment_file_names)
    shutil.rmtree(f"/dev/shm/{job_id}")
def run(self):
    """Reconstruct audio from linear spectrograms stored in an HDF file,
    batching 512 sequences at a time through a multiprocessing pool, and
    dump a bliss corpus pointing at the reconstructed files.
    """
    import h5py

    temp_dir = tempfile.TemporaryDirectory(prefix="hdf_reconstruction_")
    ref_linear_data = h5py.File(self.hdf_file.get_path(), "r")
    rl_inputs = ref_linear_data["inputs"]
    rl_tags = ref_linear_data["seqTags"]
    rl_lengths = ref_linear_data["seqLengths"]
    n_fft = rl_inputs[0].shape[0] * 2
    print("N_FFT from HDF: % i" % n_fft)
    converter = PhaseReconstructor(
        out_folder=temp_dir.name,
        backend=self.backend,
        sample_rate=self.sample_rate,
        window_shift=self.window_shift,
        window_size=self.window_size,
        n_fft=n_fft,
        iterations=self.iterations,
        preemphasis=self.preemphasis,
        file_format=self.file_format,
        corpus_format="bliss",
    )
    result_corpus = bliss_corpus.Corpus()

    # H5py has issues with multithreaded loading, so buffer 512 spectograms
    # single threaded and then distribute to the workers for conversion.
    # BUG FIX: the pool is now closed/joined via context manager, the
    # unused `corpus_path` local was removed, and the HDF file is closed.
    tag = None
    with multiprocessing.Pool(self.rqmt["cpu"]) as p:
        loaded_spectograms = []
        offset = 0
        for tag, length in zip(rl_tags, rl_lengths):
            tag = tag if isinstance(tag, str) else tag.decode()
            loaded_spectograms.append(
                (tag, np.asarray(rl_inputs[offset:offset + length[0]]).T)
            )
            offset += length[0]
            if len(loaded_spectograms) > 512:
                for recording in p.map(converter.convert, loaded_spectograms):
                    result_corpus.add_recording(recording)
                # force gc for minimal memory requirement
                del loaded_spectograms
                gc.collect()
                loaded_spectograms = []
        # process rest in the buffer
        if len(loaded_spectograms) > 0:
            for recording in p.map(converter.convert, loaded_spectograms):
                result_corpus.add_recording(recording)
    ref_linear_data.close()

    # the corpus name is the first component of the (last seen) segment tag
    result_corpus.name = tag.split("/")[0]
    result_corpus.dump("corpus.xml")
    # rewrite the temp paths inside the corpus to the final output folder
    replacement_string = "s:%s:%s:g" % (temp_dir.name, self.out_folder.get_path())
    subprocess.call(["sed", "-i", replacement_string, "corpus.xml"])
    subprocess.call(["gzip", "corpus.xml"])
    shutil.move("corpus.xml.gz", self.out_corpus.get_path())
    for path in glob.glob(temp_dir.name + "/*"):
        shutil.move(path, self.out_folder.get_path())
def run(self):
    """Concatenate recordings into ``num_splits`` roughly equal-duration
    split files, shifting all segment times onto the concatenated timeline,
    and dump both the compressed corpus and a segment-name map.
    """
    c = corpus.Corpus()
    c.load(self.bliss_corpus.get_path())
    assert (
        len(c.subcorpora) == 0
    ), "CompressCorpus is not working for corpus files containing subcorpora"

    # for each recording, extract duration
    total_duration = self.add_duration_to_recordings(c)

    # print useful information
    logging.info(f"corpus name {c.name}")
    logging.info(f"number of recordings: {len(c.recordings)}")
    logging.info(f"total duration: {total_duration} sec")

    # determine split
    split_duration = total_duration / float(self.num_splits)
    logging.info(f"split duration: {split_duration} sec")

    # create new compressed corpus file
    cc = corpus.Corpus()
    cc.name = c.name
    cc.speaker_name = c.speaker_name
    cc.speakers = c.speakers
    cc.default_speaker = c.default_speaker
    sm = corpus.SegmentMap()

    split_recordings = []  # recordings buffered for the current split
    current_duration = 0
    current_split_index = 0
    segment_count = 0  # for verification

    for i, recording in enumerate(c.recordings):
        split_recordings.append(recording)
        current_duration += recording.duration
        # flush once the split is full, or on the very last recording
        if current_duration > split_duration or i + 1 == len(c.recordings):
            new_recording_element = corpus.Recording()
            split_name = "split_%i" % current_split_index
            logging.info(f"storing split {split_name} with duration {current_duration}")
            new_recording_element.name = split_name
            output_path = os.path.join(
                self.audio_folder.get_path(), f"{split_name}.{self.format}"
            )
            new_recording_element.audio = output_path

            current_timestamp = 0
            ffmpeg_inputs = []  # audio paths concatenated for this split
            for split_recording in split_recordings:
                recording_name = split_recording.name
                for j, segment in enumerate(split_recording.segments):
                    # shift segment times onto the concatenated timeline
                    segment.start = float(segment.start) + current_timestamp
                    # NOTE(review): this compares against the *string* "inf";
                    # confirm segment.end can actually be a string here
                    if segment.end == "inf":
                        segment.end = split_recording.duration + current_timestamp
                    else:
                        segment.end = float(segment.end) + current_timestamp
                    # add segment keymap entry with the original full name
                    sm_entry = corpus.SegmentMapItem()
                    sm_entry.key = "/".join([c.name, recording_name, segment.name])
                    # unnamed segments get a 1-based index:
                    #   corpus_name/split_i/original_recording_name#segment_j
                    # named segments become:
                    #   corpus_name/split_i/original_recording_name#segment_name
                    if segment.name is None:
                        segment.name = recording_name + "#" + str(j + 1)
                    else:
                        segment.name = recording_name + "#" + segment.name
                    sm_entry.value = "/".join([c.name, split_name, segment.name])
                    sm.map_entries.append(sm_entry)
                    new_recording_element.segments.append(segment)
                    segment_count += 1
                current_timestamp += split_recording.duration
                ffmpeg_inputs.append(split_recording.audio)

            # run ffmpeg and add the new recording
            self.run_ffmpeg(ffmpeg_inputs, output_path)
            cc.add_recording(new_recording_element)

            # reset per-split state
            current_split_index += 1
            split_recordings = []
            current_duration = 0

    logging.info(f"segment count: {segment_count}")
    cc.dump(tk.uncached_path(self.compressed_corpus))
    sm.dump(tk.uncached_path(self.segment_map))