def prepare_for_training(self, target, sample_rate=16000, nested=False):
    """
    Prepare one exemplar for training

    Returns a new exemplar object with updated file locations and a
    resampled audio_file, or None if either file preparation failed

    Input
        target, str - directory to write the prepared files into
        sample_rate, int - sample rate to convert the audio file to
        nested, bool - if True, audio goes under sph/ and transcripts under stm/
    """
    # Nested corpora keep audio in an sph/ subfolder and transcripts in stm/
    if nested:
        audio_dir = os.path.join(target, "sph")
        transcript_dir = os.path.join(target, "stm")
    else:
        audio_dir = target
        transcript_dir = target

    new_audio = self.audio_file.prepare_for_training(
        os.path.join(audio_dir, basename(self.audio_file.location)),
        sample_rate=sample_rate,
    )
    new_transcript = self.transcript_file.write(
        os.path.join(transcript_dir, basename(self.transcript_file.location)))

    # Only hand back an exemplar when both pieces were produced
    if new_audio and new_transcript:
        return exemplar({
            "audio_file": new_audio,
            "transcript_file": new_transcript
        })
    return None
def validate(self):
    """
    Validate exemplar object by constraining that the filenames
    before the extension are the same

    Also requires that neither the audio file nor the transcript
    file is empty on disk

    Returns True if the exemplar is valid, False otherwise
    """
    audio_filename = basename(strip_extension(self.audio_file.location))
    transcript_filename = basename(
        strip_extension(self.transcript_file.location))

    # Audio and transcript filename must match
    # Audio file must not be empty
    # Transcript file must not be empty
    valid = (audio_filename == transcript_filename
             and os.path.getsize(self.audio_file.location)
             and os.path.getsize(self.transcript_file.location))

    # `and` short-circuits to its last operand, which here is an integer
    # file size, not a boolean; coerce so callers always get a bool
    # (matches the bool() coercion used by the sibling validate below)
    return bool(valid)
def align_json(ref_txt, json_file, filename=None):
    """
    CLI for forced alignment tools

    Using a reference txt file and a hypothesis gk json file,
    this time-aligns the reference txt file and outputs an STM file

    Input
        ref_txt, str - reference text file containing ground truth
        json_file, str - hypothesis gk JSON file
        filename, str - output STM filename
    """
    reference_tokens = preprocess_txt.parse_transcript(ref_txt)
    hypothesis = preprocess_gk_json.preprocess_transcript(json_file)
    aligned_segments = align(hypothesis, reference_tokens)

    # Derive a default output name from the reference text file
    if filename is None:
        filename = basename(sanitize(strip_extension(ref_txt))) + ".stm"

    # fix segment filename and speaker
    stem = strip_extension(filename)
    for segment in aligned_segments:
        segment.filename = stem
        segment.speaker = stem + "UnknownSpeaker"

    result = time_aligned_text()
    result.segments = aligned_segments
    result.write(filename)
def __init__(self, *args, **kwargs):
    """
    Initialize from location and populate list of SPH, WAV,
    or MP3 audio files and STM files into segments
    """
    # Copy every key from positional dicts, then keyword args,
    # onto this instance (kwargs win on overlapping keys)
    for dictionary in args:
        if isinstance(dictionary, dict):
            for key in dictionary:
                setattr(self, key, dictionary[key])
    for key in kwargs:
        setattr(self, key, kwargs[key])

    # only if not defined above should we search for exemplars
    # based on location
    if not self.exemplars:
        # instantiate exemplars for this object to override
        # static class variable
        self.exemplars = []
        # NOTE(review): the reversal changes only discovery order;
        # presumably later extensions should take precedence — confirm
        audio_extensions_to_try = ["sph", "wav", "mp3"][::-1]
        # Pair each audio file found directly in self.location with a
        # same-stem .stm transcript sitting next to it, skipping audio
        # files that have no matching transcript
        self.exemplars += [
            exemplar({
                "audio_file":
                audio_file(fl),
                "transcript_file":
                time_aligned_text(strip_extension(fl) + ".stm"),
            }) for audio_extension in audio_extensions_to_try
            for fl in (get_files(self.location, audio_extension) if self.
                       location else [])
            if (os.path.exists(strip_extension(fl) + ".stm"))
        ]
        # gather all exemplars from /stm and /sph subdirectories if present
        self.exemplars += [
            exemplar({
                "audio_file":
                audio_file(fl),
                "transcript_file":
                time_aligned_text(self.location + "/stm/" +
                                  basename(strip_extension(fl)) + ".stm"),
            }) for audio_extension in audio_extensions_to_try
            for fl in (get_files(self.location + "/sph/", audio_extension)
                       if self.location else [])
            if (os.path.exists(self.location + "/stm/" +
                               basename(strip_extension(fl)) + ".stm"))
        ]
def prepare_for_training(self, target=None, nested=False, sample_rate=16000):
    """
    Run validation and audio file preparation steps

    Input
        target, str - output directory; defaults to this corpus' location
        nested, bool - if True, audio goes under /sph/ and transcripts under /stm/
        sample_rate, int - sample rate to convert audio files to

    Returns the result of log() on the newly prepared corpus
    """
    # write corpus back in place if no target
    target = self.location if target is None else target

    audio_dir = target + ("/sph/" if nested else "/")
    transcript_dir = target + ("/stm/" if nested else "/")

    # process audio files concurrently for speed; the with-block ensures the
    # executor's worker threads are shut down even if a conversion raises
    # (the original leaked the executor by never calling shutdown())
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                partial(_.audio_file.prepare_for_training,
                        file_name=audio_dir + basename(_.audio_file.location),
                        sample_rate=sample_rate)) for _ in self.exemplars
        ]
        # trigger conversion and gather results
        audio_files = [future.result() for future in tqdm(futures)]

    # transcripts are cheap to write, so do them serially
    transcript_files = [
        _.transcript_file.write(transcript_dir +
                                basename(_.transcript_file.location))
        for _ in self.exemplars
    ]

    new_corpus = corpus({
        "location": target,
        "exemplars": [
            exemplar({
                "audio_file": af,
                "transcript_file": tf
            }) for af, tf in zip(audio_files, transcript_files)
        ],
    })
    # NOTE(review): validate()'s return value is discarded here, matching the
    # original behavior — confirm whether invalid exemplars should be dropped
    new_corpus.validate()
    return new_corpus.log()
def validate(self):
    """
    Validates exemplar object by constraining that the filenames
    before the extension are the same
    """
    audio_stem = basename(strip_extension(self.audio_file.location))
    transcript_stem = basename(
        strip_extension(self.transcript_file.location))

    # Valid only when the stems agree and both files contain data;
    # bool() normalizes the final operand of the `and` chain (an
    # integer file size) into a proper boolean
    stems_match = audio_stem == transcript_stem
    return bool(stems_match
                and os.path.getsize(self.audio_file.location)
                and os.path.getsize(self.transcript_file.location))
def extract_xlsx(filename, target_folder):
    """ For an excel spreadsheet, extract to a text file """
    workbook = pd.ExcelFile(filename)
    raw_name = sanitize(strip_extension(basename(filename)))
    output_path = target_folder + '/' + raw_name + ".txt"
    # append mode, so repeated extractions accumulate into one text file
    with open(output_path, 'a+') as output_file:
        # dump every sheet of the workbook in order
        for sheet_name in workbook.sheet_names:
            dump_sheet(output_file, workbook.parse(sheet_name).values)