def decompress_gz_file(filePath, overWrite=False):
    '''
    Decompress a gz file.

    Args:
        <filePath>: file path.
        <overWrite>: If True, overwrite the decompressed file when it already exists.

    Return:
        the absolute path of decompressed file.
    '''
    assert isinstance(filePath, str), f"<filePath> must be a string but got {type_name(filePath)}."
    filePath = filePath.rstrip()
    if not os.path.isfile(filePath):
        raise WrongPath(f"No such file: {filePath}.")
    elif not filePath.endswith(".gz"):
        raise WrongOperation(f"{filePath}: Unknown suffix.")

    outFile = filePath[:-3]
    if overWrite is True and os.path.isfile(outFile):
        os.remove(outFile)

    cmd = f"gzip -d {filePath}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to decompress file.")
    else:
        return os.path.abspath(outFile)
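# Illustrative usage sketch (not part of the library): the call pattern of
# decompress_gz_file above. The file path is hypothetical and the helpers here
# are assumed to be importable in the surrounding module.
def _demo_decompress_gz_file():
    plainFile = decompress_gz_file("data/text.gz", overWrite=True)
    print(plainFile)  # the absolute path of the restored "data/text"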
def compress_gz_file(filePath, overWrite=False):
    '''
    Compress a file to gz file.

    Args:
        <filePath>: file path.
        <overWrite>: If True, overwrite the gz file when it already exists.

    Return:
        the absolute path of compressed file.
    '''
    assert isinstance(filePath, str), f"<filePath> must be a string but got {type_name(filePath)}."
    filePath = filePath.strip()
    if not os.path.isfile(filePath):
        raise WrongPath(f"No such file: {filePath}.")

    outFile = filePath + ".gz"
    if overWrite is True and os.path.isfile(outFile):
        os.remove(outFile)

    cmd = f"gzip {filePath}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return os.path.abspath(outFile)
def compress_gz_file(filePath, overWrite=False, keepSource=False):
    '''
    Compress a file to gz file.

    Args:
        <filePath>: file path.
        <overWrite>: If True, overwrite the gz file when it already exists.
        <keepSource>: If True, retain the source file.

    Return:
        the path of compressed file.
    '''
    declare.is_file("filePath", filePath)
    declare.is_bool("overWrite", overWrite)
    declare.is_bool("keepSource", keepSource)

    filePath = os.path.abspath(filePath)
    if filePath.endswith(".gz"):
        raise WrongOperation(f"Cannot compress a .gz file: {filePath}.")
    else:
        outFile = filePath + ".gz"

    if os.path.isfile(outFile):
        if overWrite is True:
            os.remove(outFile)
        else:
            raise WrongOperation(f"File already exists: {outFile}. To overwrite it, set option <overWrite>=True.")

    if keepSource:
        cmd = f"gzip -k {filePath}"
    else:
        cmd = f"gzip {filePath}"

    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return outFile
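# Illustrative usage sketch (not part of the library): a round trip with the
# keepSource variant above. "demo.txt" is a hypothetical file.
def _demo_compress_gz_file():
    gzFile = compress_gz_file("demo.txt", overWrite=True, keepSource=True)  # -> "demo.txt.gz"
    plainFile = decompress_gz_file(gzFile, overWrite=True)                  # restore it again
    return plainFile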
def view_kaldi_usage(toolName):
    '''
    View the help information of a specified Kaldi command.

    Args:
        <toolName>: kaldi tool name.
    '''
    declare.is_valid_string("toolName", toolName)
    cmd = toolName.strip().split()
    assert len(cmd) == 1, f"<toolName> must only include one command name but got: {toolName}."
    cmd = cmd[0]
    cmd += " --help"

    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError(f"Failed to get kaldi tool info: {toolName}.")
    else:
        # Kaldi tools print their usage message to stderr, so the help text is in <err>.
        print(err.decode())
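# Illustrative usage sketch (not part of the library): print the help text of a
# Kaldi tool. Kaldi binaries must be on PATH; "copy-feats" is just an example name.
def _demo_view_kaldi_usage():
    view_kaldi_usage("copy-feats")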
def load_ali(target, aliType=None, name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: Python dict object, bytes object, exkaldi alignment object, kaldi alignment file or .npy file.
        <aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It will return different alignment objects.
        <name>: a string.
        <hmm>: file path or exkaldi HMM object.

    Return:
        exkaldi alignment data objects.
    '''
    assert isinstance(name, str) and len(name) > 0, "<name> should be a non-empty string."
    ExkaldiInfo.vertify_kaldi_existed()

    def transform(data, cmd):
        out, err, cod = run_shell_command(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, inputs=data)
        if (isinstance(cod, int) and cod != 0) and out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to transform alignment.')
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAlignment(target, name)
        elif aliType == "transitionID":
            result = NumpyAlignmentTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAlignmentPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAlignmentPdf(target, name)
        else:
            raise WrongOperation(f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}.")
        result.check_format()
        return result

    elif type_name(target) in ["NumpyAlignment", "NumpyAlignmentTrans", "NumpyAlignmentPhone", "NumpyAlignmentPdf", "BytesAlignmentTrans"]:
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, str):
        allFiles = list_files(target)
        results = {
            "NumpyAlignment": NumpyAlignment(),
            "NumpyAlignmentTrans": NumpyAlignmentTrans(),
            "NumpyAlignmentPhone": NumpyAlignmentPhone(),
            "NumpyAlignmentPdf": NumpyAlignmentPdf(),
            "BytesAlignmentTrans": BytesAlignmentTrans(),
        }

        for fileName in allFiles:
            fileName = os.path.abspath(fileName)

            if fileName.endswith(".npy"):
                temp = __read_data_from_file(fileName, "npy")
                if aliType is None:
                    temp = NumpyAlignment(temp.data)
                    results["NumpyAlignment"] += temp
                elif aliType == "transitionID":
                    temp = NumpyAlignmentTrans(temp.data)
                    results["NumpyAlignmentTrans"] += temp
                elif aliType == "phoneID":
                    temp = NumpyAlignmentPhone(temp.data)
                    results["NumpyAlignmentPhone"] += temp
                elif aliType == "pdfID":
                    temp = NumpyAlignmentPdf(temp.data)
                    results["NumpyAlignmentPdf"] += temp
                else:
                    raise WrongOperation(f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}.")

            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        print(err.decode())
                        raise ShellProcessError("Failed to get the alignment data from file.")
                    else:
                        temp = BytesAlignmentTrans(out)
                        results["BytesAlignmentTrans"] += temp

                else:
                    # phoneID and pdfID need the HMM model, so dump it to a temporary file if necessary.
                    hmmTemp = tempfile.NamedTemporaryFile("wb+")
                    try:
                        if type_name(hmm) in ("HMM", "MonophoneHMM", "TriphoneHMM"):
                            hmm.save(hmmTemp)
                            hmmFileName = hmmTemp.name
                        elif isinstance(hmm, str):
                            if not os.path.isfile(hmm):
                                raise WrongPath(f"No such file: {hmm}.")
                            hmmFileName = hmm
                        else:
                            raise UnsupportedType(f"<hmm> should be a file path or exkaldi HMM (or its subclass) object but got {type_name(hmm)}.")

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmmFileName} ark:- ark,t:-"
                            temp = transform(None, cmd)
                            temp = NumpyAlignmentPhone(temp)
                            results["NumpyAlignmentPhone"] += temp
                        elif aliType == "pdfID":
                            cmd += f" | ali-to-pdf {hmmFileName} ark:- ark,t:-"
                            temp = transform(None, cmd)
                            temp = NumpyAlignmentPdf(temp)
                            results["NumpyAlignmentPdf"] += temp
                        else:
                            raise WrongOperation(f"<aliType> should be 'transitionID', 'phoneID' or 'pdfID' but got {aliType}.")
                    finally:
                        hmmTemp.close()

        finalResult = []
        for obj in results.values():
            if not obj.is_void:
                obj.rename(name)
                finalResult.append(obj)

        if len(finalResult) == 0:
            raise WrongOperation("<target> does not include any available data.")
        elif len(finalResult) == 1:
            finalResult = finalResult[0]

        return finalResult

    else:
        raise UnsupportedType(f"<target> should be dict, file name or exkaldi alignment object but got: {type_name(target)}.")
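# Illustrative usage sketch (not part of the library): the typical call patterns
# of load_ali above. The alignment and model paths are hypothetical, and Kaldi
# must be available.
def _demo_load_ali():
    # raw transition-ID alignment, read directly from gzipped Kaldi files
    transAli = load_ali("exp/train/ali.*.gz", aliType="transitionID", name="trainAli")
    # frame-level phone IDs, which additionally need the HMM model file
    phoneAli = load_ali("exp/train/ali.*.gz", aliType="phoneID", hmm="exp/train/final.mdl")
    return transAli, phoneAli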
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the SRILM toolkit.
    If you don't specify the discount through the <config> option, "kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of ARPA LM.
        <config>: extra configurations, a Python dict object. You can use the .check_config("train_ngrams_srilm") function to get a reference of extra configurations. Also you can run the shell command "ngram-count" to look up their usage.

    Return:
        the path of the ARPA LM file.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that should be split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if isinstance(cod, int) and cod != 0:
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                            if key.endswith("discount"):
                                specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError("Failed to generate N-Grams language model.")

        return outFile
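# Illustrative usage sketch (not part of the library): train a 3-gram ARPA LM with
# the function above. The LexiconBank object and the corpus path are hypothetical;
# "-wbdiscount" is one of the SRILM discount flags, shown here overriding the
# default "-kndiscount".
def _demo_train_ngrams_srilm(lexicons):
    return train_ngrams_srilm(lexicons, order=3, text="data/train/corpus.txt",
                              outFile="exp/lm/3grams.arpa", config={"-wbdiscount": True})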
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of ARPA LM.
        <config>: extra configurations, a Python dict object. You can use the .check_config("train_ngrams_kenlm") function to get a reference of extra configurations. Also you can run the shell command "lmplz" to look up their usage.

    Return:
        the path of the ARPA LM file.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that should be split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if isinstance(cod, int) and cod != 0:
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
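# Illustrative usage sketch (not part of the library): the KenLM counterpart of
# the SRILM demo above. Names are hypothetical; the bundled lmplz binary must exist.
# "-S 20%" caps lmplz memory usage, mirroring the "--memory"/"-S" check above.
def _demo_train_ngrams_kenlm(lexicons):
    return train_ngrams_kenlm(lexicons, order=4, text="data/train/corpus.txt",
                              outFile="exp/lm/4grams.arpa", config={"-S": "20%"})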
def load_ali(target, aliType="transitionID", name="ali", hmm=None): ''' Load alignment data. Args: <target>: Python dict object,bytes object,exkaldi alignment object,kaldi alignment file or .npy file. <aliType>: None,or one of 'transitionID','phoneID','pdfID'. It will return different alignment object. <name>: a string. <hmm>: file path or exkaldi HMM object. Return: exkaldi alignment objects. ''' declare.is_valid_string("name", name) declare.is_instances("aliType", aliType, [None, "transitionID", "phoneID", "pdfID"]) declare.kaldi_existed() def transform(data, cmd): out, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=data) if (isinstance(cod, int) and cod != 0) and out == b'': print(err.decode()) raise KaldiProcessError('Failed to transform alignment.') else: result = {} sp = BytesIO(out) for line in sp.readlines(): line = line.decode() line = line.strip().split() utt = line[0] matrix = np.array(line[1:], dtype=np.int32) result[utt] = matrix return result if isinstance(target, dict): if aliType is None: result = NumpyAlignment(target, name) elif aliType == "transitionID": result = NumpyAlignmentTrans(target, name) elif aliType == "phoneID": result = NumpyAlignmentPhone(target, name) elif aliType == "pdfID": result = NumpyAlignmentPdf(target, name) else: raise WrongOperation( f"<aliType> should be None,'transitionID','phoneID' or 'pdfID' but got {aliType}." ) result.check_format() return result elif isinstance( target, (NumpyAlignment, NumpyAlignmentTrans, BytesAlignmentTrans)): result = copy.deepcopy(target) result.rename(name) return result elif isinstance(target, ArkIndexTable): result = target.fetch(arkType="ali") if aliType in ["phoneID", "pdfID"]: result = result.to_numpy(aliType, hmm) result.rename(name) return result elif isinstance(target, str): allFiles = list_files(target) numpyAli = {} bytesAli = [] for fileName in allFiles: fileName = fileName.strip() if fileName.endswith(".npy"): try: temp = np.load(fileName, allow_pickle=True) for utt, mat in temp: numpyAli[utt] = mat except: raise UnsupportedType( f'This is not a valid Exkaldi npy file: {fileName}.') else: if fileName.endswith('.gz'): cmd = f'gunzip -c {fileName}' else: cmd = f'cat {fileName}' if aliType is None or aliType == "transitionID": out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE") if (isinstance(cod, int) and cod != 0) or out == b'': print(err.decode()) raise ShellProcessError( f"Failed to get the alignment data from file: {fileName}." 
) else: bytesAli.append(out) else: with FileHandleManager() as fhm: declare.is_potential_hmm("hmm", hmm) if not isinstance(hmm, str): hmmTemp = fhm.create("wb+") hmm.save(hmmTemp) hmm = hmmTemp.name if aliType == "phoneID": cmd += f" | ali-to-phones --per-frame=true {hmm} ark:- ark,t:-" temp = transform(None, cmd) else: cmd = f" | ali-to-pdf {hmm} ark:- ark,t:-" temp = transform(None, cmd) numpyAli.update(temp) bytesAli = b"".join(bytesAli) if aliType is None: if len(numpyAli) == 0: return BytesAlignmentTrans(bytesAli, name=name) elif len(bytesAli) == 0: return NumpyAlignment(numpyAli, name=name) else: result = NumpyAlignmentTrans( numpyAli) + BytesAlignmentTrans(bytesAli) result.rename(name) return result elif aliType == "transitionID": if len(numpyAli) == 0: return BytesAlignmentTrans(bytesAli, name=name) elif len(bytesAli) == 0: return NumpyAlignmentTrans(numpyAli, name=name) else: result = NumpyAlignmentTrans( numpyAli) + BytesAlignmentTrans(bytesAli) result.rename(name) return result elif aliType == "phoneID": return NumpyAlignmentPhone(numpyAli, name=name) else: return NumpyAlignmentPdf(numpyAli, name=name) else: raise UnsupportedType( f"<target> should be dict,file name or exkaldi alignment or index table object but got: {type_name(target)}." )
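# Illustrative usage sketch (not part of the library): the newer load_ali resolves
# wildcards via list_files and converts to pdf IDs through ali-to-pdf when an HMM
# is given. All paths are hypothetical.
def _demo_load_ali_v2():
    pdfAli = load_ali("exp/tri1/ali.*.gz", aliType="pdfID", hmm="exp/tri1/final.mdl")
    return pdfAli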
def train_ngrams_srilm(lexicons, order, textFile, outFile, config=None):
    '''
    Train n-grams language model with the SRILM toolkit.

    Args:
        <lexicons>: words.txt file path or Exkaldi LexiconBank object.
        <order>: the maximum order of n-grams.
        <textFile>: text corpus file.
        <outFile>: ARPA out file name.
        <config>: configurations, a Python dict object. You can use the .check_config("train_ngrams_srilm") function to get the configuration information that you can set. Also you can run the shell command "ngram-count" to look up their meaning.
    '''
    assert isinstance(order, int) and order > 0 and order < 10, "Expected <order> is a positive int value and it must be smaller than 10."
    assert isinstance(textFile, str), "Expected <textFile> is name-like string."
    assert isinstance(outFile, str), "Expected <outFile> is name-like string."
    assert type_name(lexicons) == "LexiconBank", f"Expected <lexicons> is exkaldi LexiconBank object but got {type_name(lexicons)}."

    ExkaldiInfo.prepare_srilm()

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file: {textFile}")
    else:
        ## Should check the number of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if isinstance(cod, int) and cod != 0:
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")

    wordlist = tempfile.NamedTemporaryFile("w+", encoding='utf-8', suffix=".txt")
    unkSymbol = lexicons("oov")

    try:
        lexiconp = lexicons("lexiconp")
        words = [x[0] for x in lexiconp.keys()]
        wordlist.write("\n".join(words))
        wordlist.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                            if key.endswith("discount"):
                                specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f"ngram-count -text {textFile} -order {order} -limit-vocab -vocab {wordlist.name} -unk -map-unk {unkSymbol} "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError('Failed to generate ngrams language model.')
        else:
            return os.path.abspath(outFile)

    finally:
        wordlist.close()
def train_ngrams_kenlm(lexicons, order, textFile, outFile, config=None):
    '''
    Train n-grams language model with the KenLM toolkit.

    Args:
        <lexicons>: words.txt file path or Exkaldi LexiconBank object.
        <order>: the maximum order of n-grams.
        <textFile>: text corpus file.
        <outFile>: ARPA out file name.
        <config>: configurations, a Python dict object. You can use the .check_config("train_ngrams_kenlm") function to get the configuration information that you can set. Also you can run the shell command "lmplz" to look up their meaning.
    '''
    assert isinstance(order, int) and 0 < order <= 6, "We support maximum 6-grams LM in current version."

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file: {textFile}")
    else:
        ## Should check the number of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if isinstance(cod, int) and cod != 0:
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")

    extraConfig = " "
    if config is not None:
        assert isinstance(config, dict), f"<config> should be dict object but got: {type_name(config)}."
        if check_config(name='train_ngrams_kenlm', config=config):
            if "--temp_prefix" in config.keys() and "-T" in config.keys():
                raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
            if "--memory" in config.keys() and "-S" in config.keys():
                raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        extraConfig += f"{key} "
                else:
                    extraConfig += f"{key} {value} "

    assert isinstance(outFile, str), "<outFile> should be a string."
    if not outFile.rstrip().endswith(".arpa"):
        outFile += ".arpa"
    make_dependent_dirs(outFile, pathIsFile=True)

    words = tempfile.NamedTemporaryFile("w+", suffix=".txt", encoding="utf-8")
    try:
        if type_name(lexicons) == "LexiconBank":
            ws = lexicons("words")
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws.keys())
        elif isinstance(lexicons, str):
            if not os.path.isfile(lexicons):
                raise WrongPath(f"No such file: {lexicons}.")
            with open(lexicons, "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            ws = []
            for line in lines:
                line = line.strip().split(maxsplit=1)
                if len(line) < 1:
                    continue
                else:
                    ws.append(line[0])
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws)
        else:
            raise UnsupportedType("<lexicons> should be LexiconBank object or file path.")

        words.write(ws)
        words.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {textFile} --arpa {outFile} --limit_vocab_file {words.name}"
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            raise KenlmProcessError("Failed to generate arpa file.")
        else:
            return os.path.abspath(outFile)

    finally:
        words.close()
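# Illustrative usage sketch (not part of the library): unlike the newer version,
# this variant also accepts a plain words.txt path for <lexicons>. Paths are hypothetical.
def _demo_train_ngrams_kenlm_v1():
    return train_ngrams_kenlm("data/lang/words.txt", order=3,
                              textFile="data/train/corpus.txt",
                              outFile="exp/lm/3grams.arpa")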