def perplexity(self, transcription):
    '''
    Compute the perplexity of a transcription.

    Args:
        <transcription>: file path or exkaldi Transcription object.

    Return:
        a named tuple: PPL(prob, sentences, words, ppl, ppl1).
    '''
    declare.is_potential_transcription("transcription", transcription)
    if isinstance(transcription, str):
        transcription = load_transcription(transcription)

    prob = self.score(transcription)
    sentences = len(prob)
    words = transcription.sentence_length().sum()
    sumProb = prob.sum()
    ppl = 10 ** (-sumProb / (sentences + words))
    ppl1 = 10 ** (-sumProb / words)

    return namedtuple("PPL", ["prob", "sentences", "words", "ppl", "ppl1"])(
        round(sumProb, 2), sentences, words, round(ppl, 2), round(ppl1, 2)
    )
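# Illustrative usage (a sketch, not part of the library): assuming `model` is an exkaldi
# n-grams language model object exposing this method, and "dev_text.txt" is a hypothetical
# transcription file in Kaldi "utt-ID word1 word2 ..." format.
#
#   result = model.perplexity("dev_text.txt")
#   print(result.ppl, result.ppl1)   # corpus perplexity with and without end-of-sentence tokens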
def score(self, transcription, bos=True, eos=True):
    '''
    Score a transcription.

    Args:
        <transcription>: file path or exkaldi Transcription object.
        <bos>: if True, add <s> to the head.
        <eos>: if True, add </s> to the tail.

    Return:
        an exkaldi Metric object.
    '''
    declare.is_potential_transcription("transcription", transcription)
    if isinstance(transcription, str):
        transcription = load_transcription(transcription)

    scores = Metric(name=f"LMscore({transcription.name})")
    for uttID, txt in transcription.items():
        scores[uttID] = self.score_sentence(txt, bos, eos)

    return scores
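# Illustrative usage (a sketch): assuming `model` is the same n-grams model object and
# "test_text.txt" a hypothetical transcription file.
#
#   trans = load_transcription("test_text.txt")
#   uttScores = model.score(trans, bos=True, eos=True)   # per-utterance log10 probabilities
#   print(uttScores.sum())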
def wer(ref, hyp, ignore=None, mode='all'):
    '''
    Compute the WER (word error rate) between <ref> and <hyp>.

    Args:
        <ref>, <hyp>: exkaldi Transcription objects or file paths.
        <ignore>: a symbol to ignore.
        <mode>: "all" or "present".

    Return:
        a namedtuple of score information.
    '''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])
    declare.kaldi_existed()

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    with FileHandleManager() as fhm:

        if ignore is None:
            if type_name(hyp) == "Transcription":
                hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                hyp.save(hypTemp)
                hyp = hypTemp.name

            if type_name(ref) == "Transcription":
                refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                ref.save(refTemp)
                ref = refTemp.name

            cmd = f'compute-wer --text --mode={mode} ark:{ref} ark,p:{hyp}'
            scoreOut, scoreErr, _ = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")

        else:
            # remove the ignored symbol from hyp
            if type_name(hyp) == "Transcription":
                hyp = hyp.save()
            else:
                with open(hyp, "r", encoding="utf-8") as fr:
                    hyp = fr.read()

            hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {hypTemp.name}'
            _, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=hyp)
            # stdout is redirected into the temp file, so check the exit code and the file size
            if cod != 0 or os.path.getsize(hypTemp.name) == 0:
                raise WrongDataFormat("<hyp> has wrong data formation.", err.decode())

            # remove the ignored symbol from ref
            if type_name(ref) == "Transcription":
                ref = ref.save()
            else:
                with open(ref, "r", encoding="utf-8") as fr:
                    ref = fr.read()

            refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {refTemp.name}'
            _, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=ref)
            if cod != 0 or os.path.getsize(refTemp.name) == 0:
                raise WrongDataFormat("<ref> has wrong data formation.", err.decode())

            # score
            cmd = f'compute-wer --text --mode={mode} ark:{refTemp.name} ark,p:{hypTemp.name}'
            scoreOut, scoreErr, _ = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")

    if len(scoreOut) == 0:
        raise KaldiProcessError("Failed to compute WER.", scoreErr.decode())

    out = scoreOut.decode().split("\n")
    pattern1 = r'%WER (.*) \[ (.*) \/ (.*),(.*) ins,(.*) del,(.*) sub \]'
    pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
    pattern3 = r"Scored (.*) sentences,(.*) not present in hyp."
    s1 = re.findall(pattern1, out[0])[0]
    s2 = re.findall(pattern2, out[1])[0]
    s3 = re.findall(pattern3, out[2])[0]

    # compute-wer prints "%SER <ser> [ <wrong sentences> / <total sentences> ]"
    return namedtuple("Score", [
        "WER", "words", "insErr", "delErr", "subErr",
        "SER", "sentences", "wrongSentences", "missedSentences",
    ])(
        float(s1[0]),  # WER
        int(s1[2]),    # words
        int(s1[3]),    # insertions
        int(s1[4]),    # deletions
        int(s1[5]),    # substitutions
        float(s2[0]),  # SER
        int(s2[2]),    # sentences (total scored)
        int(s2[1]),    # wrong sentences
        int(s3[1]),    # missed sentences
    )
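# Illustrative usage (a sketch): "ref_text.txt" and "hyp_text.txt" are hypothetical
# Kaldi-style text files, and Kaldi's compute-wer tool must be reachable on PATH.
#
#   score = wer("ref_text.txt", "hyp_text.txt", ignore="<sil>", mode="present")
#   print(score.WER, score.SER, score.missedSentences)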
def edit_distance(ref, hyp, ignore=None, mode='present'):
    '''
    Compute the edit-distance score between <ref> and <hyp>.

    Args:
        <ref>, <hyp>: exkaldi Transcription objects or file paths.
        <ignore>: a symbol to ignore.
        <mode>: "all" or "present". If "present", skip utterances that are missing in <ref>.

    Return:
        a namedtuple of score information.
    '''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    if isinstance(ref, str):
        ref = load_transcription(ref)
    if isinstance(hyp, str):
        hyp = load_transcription(hyp)

    allED = 0
    words = 0
    sentences = 0
    wrongSentences = 0
    missedSentences = 0

    ref = ref.sort()
    hyp = hyp.sort()
    for utt, hypTrans in hyp.items():
        try:
            refTrans = ref[utt]
        except KeyError:
            if mode == "all":
                raise Exception(
                    "Missing transcription in reference. Set <mode> as 'present' to skip it."
                )
            else:
                missedSentences += 1
        else:
            sentences += 1
            refTrans = refTrans.split()
            hypTrans = hypTrans.split()
            ed, wds = pure_edit_distance(refTrans, hypTrans, ignore=ignore)
            allED += ed
            words += wds
            if ed > 0:
                wrongSentences += 1

    if sentences == 0:
        raise Exception(
            "No utterance of <hyp> was found in the reference. This is not a reasonable result. Please check the files."
        )

    return namedtuple("Score", [
        "editDistance", "words", "SER", "sentences", "wrongSentences", "missedSentences",
    ])(allED, words, wrongSentences / sentences, sentences, wrongSentences, missedSentences)
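# Illustrative usage (a sketch): unlike wer(), this is a pure-Python score and does not
# call Kaldi. `refTrans` and `hypTrans` are assumed exkaldi Transcription objects.
#
#   score = edit_distance(refTrans, hypTrans, mode="present")
#   print(score.editDistance / score.words)   # token-level edit-distance rate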
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the SRILM toolkit.
    If you don't specify a discount method through <config>, "kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi Transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use the .check_config("train_ngrams_srilm") function to get a reference of extra configurations.
    You can also run the shell command "ngram-count" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare the srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if isinstance(cod, int) and cod != 0:
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or the sentences are extremely short."
                    )
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The transcription doesn't seem to be separated by spaces or the sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                            if key.endswith("discount"):
                                specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        cmd += extraConfig  # append the user-supplied extra configurations

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError('Failed to generate the N-Grams language model.')

        return outFile
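# Illustrative usage (a sketch): `lexicons` is an assumed exkaldi LexiconBank object and
# "train_text.txt" a hypothetical training corpus; SRILM's ngram-count must be installed.
#
#   arpaFile = train_ngrams_srilm(lexicons, order=3, text="train_text.txt",
#                                 outFile="lm/3grams.arpa")   # kndiscount + interpolate by default
#   print(arpaFile)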
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi Transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use the .check_config("train_ngrams_kenlm") function to get a reference of extra configurations.
    You can also run the shell command "lmplz" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if isinstance(cod, int) and cod != 0:
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or the sentences are extremely short."
                    )
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The transcription doesn't seem to be separated by spaces or the sentences are extremely short."
                )
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation(
                        '"--temp_prefix" and "-T" are the same configuration so only one of them is expected.'
                    )
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation(
                        '"--memory" and "-S" are the same configuration so only one of them is expected.'
                    )
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate the ARPA file.")

        return outFile
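# Illustrative usage (a sketch): same interface as the SRILM variant above, but it calls
# the bundled KenLM "lmplz" binary instead; the file names are hypothetical.
#
#   arpaFile = train_ngrams_kenlm(lexicons, order=4, text="train_text.txt",
#                                 outFile="lm/4grams.arpa")
#   print(arpaFile)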