def clean(content):
    """Normalize one Kaldi transcript string.

    Strips Kaldi non-word markers, maps ASCII placeholder characters to
    their Serbian/Croatian letters, and collapses repeated spaces.
    """
    sentence = jiwer.RemoveKaldiNonWords()(content)
    # "^" is a regex anchor, so it must be replaced literally rather than
    # through SubstituteRegexes like the other placeholder characters.
    sentence = sentence.replace("^", "ć")
    finish = jiwer.Compose([
        jiwer.SubstituteRegexes({
            r"{": r"š",
            r"`": r"ž",
            r"}": r"đ",
            r"~": r"č",
            r"#": r"dž",
        }),
        jiwer.RemoveMultipleSpaces(),
    ])
    return finish(sentence)
def get_paired_text_corrected(batch):
    """Read each transcript file named in *batch* and return the cleaned texts.

    Reuses clean() instead of duplicating its pipeline inline; the original
    applied RemoveMultipleSpaces before the character substitutions, but none
    of the substitutions add or remove whitespace, so the result is identical.

    :param batch: iterable of file names relative to ``path_txt``
    :return: list of cleaned transcript strings, one per file
    """
    sentences = []
    for filename in batch:
        # assumes module-level path_txt ends with a path separator — TODO confirm
        with open(path_txt + filename, "r") as f:
            sentences.append(clean(f.read()))
    return sentences
def compute_perc_script_missing(original_script, transcript, language):
    """Check how much of *original_script* is missing in *transcript*.

    Cleans both texts (accent folding, number words, punctuation, lowercase,
    tokenization), removes stopwords from the script, then counts script words
    whose stem does not occur among the transcript's stems.

    :param original_script: expected script text; ``${...}`` template
        variables are stripped before comparison
    :param transcript: recognized/transcribed text
    :param language: language key passed to remove_stopwords/get_stemmer
        (project helpers defined elsewhere)
    :return: (fraction of script words missing, list of the missing words)
    """
    cleaning = jiwer.Compose([
        jiwer.SubstituteRegexes({"¡": "", "¿": "", "á": "a", "é": "e",
                                 "í": "i", "ó": "o", "ú": "u"}),
        # NOTE(review): the space-padded keys (" uno " etc.) look intended as
        # whole-word matches; verify jiwer.SubstituteWords actually matches
        # keys containing spaces in the installed jiwer version.
        jiwer.SubstituteWords({
            "tardes": "dias", "noches": "dias",
            " uno ": " 1 ", " dos ": " 2 ", " tres ": " 3 ",
            " cuatro ": " 4 ", " cinco ": " 5 ", " seis ": " 6 ",
            " siete ": " 7 ", " ocho ": " 8 ", " nueve ": " 9 "}),
        jiwer.RemovePunctuation(),
        jiwer.ToLowerCase(),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
        jiwer.RemoveEmptyStrings()
    ])
    # Remove anything between ${ and } (template variables) from the script.
    original_script_transformed = re.sub(r'\${.*?\}', '', original_script)
    # Clean both texts; after the pipeline each is a list of words.
    original_script_transformed = cleaning(original_script_transformed)
    transcript_transformed = cleaning(transcript)
    if not original_script_transformed:
        # Cleaning removed every word — nothing can be missing; guards the
        # division at the end against ZeroDivisionError.
        return 0.0, []
    # Remove stopwords from the script, but keep the original word list when
    # stopword removal would leave nothing to compare.
    original_script_transformed_no_stopwords = remove_stopwords(
        original_script_transformed, language)
    if len(original_script_transformed_no_stopwords) != 0:
        original_script_transformed = original_script_transformed_no_stopwords
    # Stem the transcript once; use a set for O(1) membership tests.
    stemmer = get_stemmer(language)
    transcript_stems = {stemmer.stem(word) for word in transcript_transformed}
    # Script words whose stem never appears in the transcript.
    words_missing = [word for word in original_script_transformed
                     if stemmer.stem(word) not in transcript_stems]
    return len(words_missing) / len(original_script_transformed), words_missing
# NOTE(review): whitespace-mangled fragment of a subtitle/ASR alignment loop;
# the enclosing loop and "if" headers are outside this view ("else: break"
# has no visible header), so the collapsed original line is kept byte-identical
# rather than re-indented by guesswork.
# Visible steps: compute the subtitle end time in seconds, append the
# subtitle's tag-free text to ground_truth, query the hypothesis text for
# [start, end] from kd, mark those words consumed, build a jiwer pipeline
# (lowercase, collapse spaces, whitespace->space, tokenize, strip punctuation,
# drop empties, map "ё"->"е"), normalize pairs in both directions via
# replace_pairs, then compute and print WER with S/D/H/I counts.
end = next_sub.end.hours * 3600 + next_sub.end.minutes * 60 + next_sub.end.seconds + next_sub.end.milliseconds / 1000 ground_truth = ground_truth + " " + next_sub.text_without_tags hypothesis = kd.query_text(start, end) else: break kd.mark_words(start, end) transformation = jiwer.Compose([ jiwer.ToLowerCase(), jiwer.RemoveMultipleSpaces(), jiwer.RemoveWhiteSpace(replace_by_space=True), jiwer.SentencesToListOfWords(), jiwer.RemovePunctuation(), jiwer.RemoveEmptyStrings(), jiwer.SubstituteRegexes({r"ё": r"е"}) ]) gt = transformation([ground_truth]) hp = transformation([hypothesis]) gt, hp = replace_pairs(gt, hp) hp, gt = replace_pairs(hp, gt) wer(gt, hp) r = jiwer.compute_measures( gt, hp ) print(f"\nWER:{r['wer'] * 100:.3f}\t\tS:{r['S']} D:{r['D']} H:{r['H']} I:{r['I']}\n")
# Fix: the __main__ block below uses argparse, which was never imported.
# Imports regrouped per PEP 8: stdlib first, then third-party.
import argparse
import textwrap

import jiwer
import regex


# jiwer.RemovePunctuation removes string.punctuation, not all Unicode
# punctuation, so use the regex module's \p{P} property instead.
class RemovePunctuation(jiwer.AbstractTransform):
    def process_string(self, s: str):
        """Return *s* with every Unicode punctuation character removed."""
        return regex.sub(r"\p{P}", "", s)


# Remove some differences that we don't care about for comparisons:
# case, punctuation, filler words, spelled-out numbers, and a few
# domain-specific spelling variants.
transform = jiwer.Compose([
    jiwer.ToLowerCase(),
    RemovePunctuation(),
    jiwer.SubstituteRegexes(
        {r"\b(uh|um|ah|hi|alright|all right|well|kind of)\b": ""}),
    jiwer.SubstituteWords({
        "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
        "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10",
        "plus": "+", "minus": "-",
        "check out": "checkout", "hard point": "hardpoint"}),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings()
])

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("json_path")
    parser.add_argument("--verbose", action="store_true")
# NOTE(review): whitespace-mangled fragment of a subtitle/ASR alignment loop
# (a fuller variant of the fragment above); it begins with a bare "break" and
# contains an "else: break" whose headers are outside this view, so the
# collapsed original line is kept byte-identical rather than re-indented by
# guesswork.
# Visible steps: compute subtitle start time in seconds; if the gap to the
# previous end exceeds 0.5 s, push the subtitle back and stop accumulating;
# otherwise extend end/ground_truth, query the hypothesis for [start, end],
# mark those words consumed, build a jiwer pipeline (lowercase, strip
# "…", "–", "«", "»", collapse spaces, whitespace->space, tokenize, strip
# punctuation, drop empties, map "ё"->"е"), normalize pairs in both
# directions via replace_pairs, then compute WER.
break tstart = next_sub.start.hours * 3600 + next_sub.start.minutes * 60 + next_sub.start.seconds + next_sub.start.milliseconds / 1000 if (tstart - end) > 0.5: srt.push(next_sub) break end = next_sub.end.hours * 3600 + next_sub.end.minutes * 60 + next_sub.end.seconds + next_sub.end.milliseconds / 1000 ground_truth = ground_truth + " " + next_sub.text_without_tags hypothesis = kd.query_text(start, end) else: break kd.mark_words(start, end) transformation = jiwer.Compose([ jiwer.ToLowerCase(), jiwer.SubstituteRegexes({r"…|–|«|»": r""}), jiwer.RemoveMultipleSpaces(), jiwer.RemoveWhiteSpace(replace_by_space=True), jiwer.SentencesToListOfWords(), jiwer.RemovePunctuation(), jiwer.RemoveEmptyStrings(), jiwer.SubstituteRegexes({r"ё": r"е"}), ]) gt = transformation([ground_truth]) hp = transformation([hypothesis]) gt, hp = replace_pairs(gt, hp) hp, gt = replace_pairs(hp, gt) wer(gt, hp)