def test_deepspeech():
    """DeepSpeech transcription of the sample clip should match the expected TEXT."""
    model = DeepSpeech(os.path.join("test_samples", "english.pbmm"))
    sample = os.path.join("test_samples", "audio.wav")
    result = model.transcribe(sample)
    assert similarity(TEXT, result) > MIN_SYNTHESIS_SCORE
def sim_align(self, a, start, end):
    """
    Find the contiguous run of words in self.text[start:end] that is most
    similar to the string `a`.

    Parameters
    ----------
    a : str
        Text to align against the source region
    start : int
        Start index of the search region in self.text
    end : int
        End index of the search region in self.text

    Returns
    -------
    (int, int, float)
        Start index, end index and similarity score of the best-matching span
    """
    region_start = start  # remember where the region began before reassignment
    source = self.text[start:end]
    words = source.split(" ")
    best = ""
    best_score = 0
    # Try every non-empty contiguous word span. `j` is an exclusive slice
    # bound, so it must run to len(words) + 1 — the original range stopped at
    # len(words), which meant no candidate could ever include the final word,
    # and i == j produced a useless empty-string candidate.
    for i in range(len(words)):
        for j in range(i + 1, len(words) + 1):
            candidate = " ".join(words[i:j])
            score = similarity(a, candidate)
            if score > best_score:
                best = candidate
                best_score = score
    # Search from the original region start so an identical occurrence of
    # `best` earlier in self.text cannot hijack the alignment.
    start = self.text.index(best, region_start)
    end = start + len(best)
    return start, end, best_score
def test_hifigan_synthesis():
    """Synthesis through the HiFi-GAN vocoder should yield an intelligible WAV file."""
    model_path = os.path.join("test_samples", "hifigan.pt")
    config_path = os.path.join("test_samples", "config.json")
    output_wav = "synthesized_audio.wav"
    stt = Silero()
    vocoder = Hifigan(model_path, config_path)
    text = "the monkeys live"
    synthesize(
        model=FakeModelForSynthesis(),
        text=text,
        graph_path=None,
        audio_path=output_wav,
        vocoder=vocoder,
    )
    assert os.path.isfile(output_wav)
    transcribed = stt.transcribe(output_wav)
    assert similarity(text, transcribed) > MIN_SYNTHESIS_SCORE
    os.remove(output_wav)
def generate_clips_from_subtitles(
    audio_path,
    subs,
    transcription_model,
    output_path,
    logging=logging,
    min_length=MIN_LENGTH,
    max_length=MAX_LENGTH,
    min_confidence=MIN_CONFIDENCE,
):
    """
    Generates clips from subtitles.

    Parameters
    ----------
    audio_path : str
        Path to audio file (must have been converted using convert_audio)
    subs : list
        List of pysrt subtitle objects
    transcription_model : TranscriptionModel
        Transcription model
    output_path : str
        Path to save audio clips to
    logging : logging (optional)
        Logging object to write logs to
    min_length : float (optional)
        Minimum duration of a clip in seconds
    max_length : float (optional)
        Maximum duration of a clip in seconds
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for

    Returns
    -------
    (list, list, list)
        Labelled clip fragments, filenames of unlabelled fragments, and
        clip lengths in seconds
    """
    logging.info("Loading subtitles...")
    total = len(subs)
    logging.info(f"{total} subtitle lines detected...")
    result_fragments = []
    unlabelled_fragments = []
    clip_lengths = []

    for i, sub in enumerate(subs):
        duration = sub.duration.seconds + (sub.duration.milliseconds / 1000)
        if min_length <= duration <= max_length:
            start = sub.start.to_time().strftime("%H:%M:%S.%f")
            end = sub.end.to_time().strftime("%H:%M:%S.%f")
            filename = cut_audio(audio_path, start, end, output_path)
            clip_path = os.path.join(output_path, filename)
            try:
                transcript = transcription_model.transcribe(clip_path)
            except Exception:
                # Narrowed from a bare except: best-effort behaviour is kept
                # (skip the clip and continue), but KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                logging.info(f"Could not transcribe {clip_path}")
                transcript = None

            if transcript:
                text = sub.text.strip().replace("\n", " ")
                score = similarity(transcript, text)
                if score >= min_confidence:
                    result_fragments.append(
                        {
                            "name": filename,
                            "start": start,
                            "end": end,
                            "duration": duration,
                            "transcript": transcript,
                            "text": text,
                            "score": score,
                        }
                    )
                    clip_lengths.append(duration)
                else:
                    # Transcript did not match the subtitle closely enough.
                    unlabelled_fragments.append(filename)
        logging.info(f"Progress - {i+1}/{total}")

    return result_fragments, unlabelled_fragments, clip_lengths
def test_silero():
    """Silero transcription of the sample clip should match the expected TEXT."""
    stt = Silero()
    sample = os.path.join("test_samples", "audio.wav")
    assert similarity(TEXT, stt.transcribe(sample)) > MIN_SYNTHESIS_SCORE
def test_similarity():
    """similarity returns 1 for identical strings and 0 for fully distinct ones."""
    assert similarity("abc", "abc") == 1
    assert similarity("abc", "def") == 0