def __setattr__(self, name, value):
    if '_ArgumentParser__arguments' in self.__dict__.keys():
        if "--"+name in self.__arguments.keys() and name in self.__dict__.keys():
            # verify the new value against the stored prototype
            proto = self.__arguments["--"+name]
            if isinstance(value, (list, tuple)):
                temp = value
            else:
                temp = [value, ]
            for v in temp:
                assert isinstance(v, proto.dtype), f"<{name}> needs {proto.dtype.__name__} type value but got: {value}."
                if proto.choices is not None:
                    assert v in proto.choices, f"<{name}> should be one of {proto.choices} but got: {value}."
                if proto.minV is not None:
                    declare.greater_equal(f"option value of {name}", v, "minimum expected value", proto.minV)
                if proto.maxV is not None:
                    declare.less_equal(f"option value of {name}", v, "maximum expected value", proto.maxV)
            # modify the backup
            self.__arguments["--"+name] = proto._replace(value=value)
    super().__setattr__(name, value)
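# Behaviour sketch (comment only): once an option has been parsed, assigning to
# the corresponding attribute goes through the checks above before the backup in
# self.__arguments is updated. The option "--epoch" (dtype int, minV=1) is made
# up for illustration.
#
#   args.epoch = 20        # accepted; the stored prototype is updated too
#   args.epoch = "twenty"  # AssertionError: <epoch> needs int type value ...
#   args.epoch = 0         # rejected by the minimum-value check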
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the SRILM toolkit.
    If you don't specify a discount method via the <config> option, "kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the arpa LM.
        <config>: extra configurations, a Python dict object.
                  You can use the .check_config("train_ngrams_srilm") function to get a reference of extra configurations.
                  You can also run the shell command "ngram-count" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare the srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                            if key.endswith("discount"):
                                specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        # append the user-specified extra options
        cmd += extraConfig

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError("Failed to generate N-Grams language model.")

        return outFile
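# Usage sketch for train_ngrams_srilm() (kept as a comment so nothing runs at
# import time). It assumes `lexicons` is an already-built exkaldi LexiconBank
# and that "train_text.txt" exists; "-wbdiscount" is a standard SRILM
# "ngram-count" flag shown only as one way to override the default
# "-kndiscount", and whether check_config() accepts it depends on the
# configuration reference shipped with exkaldi.
#
#   arpaFile = train_ngrams_srilm(
#                   lexicons=lexicons,
#                   order=3,
#                   text="train_text.txt",
#                   outFile="exp/lm/3gram.arpa",
#                   config={"-wbdiscount": True},
#               )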
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the arpa LM.
        <config>: extra configurations, a Python dict object.
                  You can use the .check_config("train_ngrams_kenlm") function to get a reference of extra configurations.
                  You can also run the shell command "lmplz" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
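# Usage sketch for train_ngrams_kenlm() (comment only). Same assumptions as the
# SRILM example above: `lexicons` is an existing LexiconBank and
# "train_text.txt" is an illustrative corpus path. "-S" (memory) and "--prune"
# are standard lmplz options; whether check_config() accepts them depends on
# the configuration reference shipped with exkaldi.
#
#   arpaFile = train_ngrams_kenlm(
#                   lexicons=lexicons,
#                   order=4,
#                   text="train_text.txt",
#                   outFile="exp/lm/4gram.arpa",
#                   config={"-S": "20%", "--prune": "0 1 2"},
#               )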
def add(self, name, dtype, abbr=None, default=None, choices=None, minV=None, maxV=None, discription=None):
    '''
    Add a new option.

    Args:
        _name_: a string which must have a format such as "--exkaldi" (but "--help" is unavailable exceptionally).
        _dtype_: float, int, str or bool.
        _abbr_: None or an abbreviation of name which must have a format such as "-e" (but "-h" is unavailable exceptionally).
        _default_: the default value or a list/tuple of values.
        _choices_: a list/tuple of candidate values.
        _minV_: set the minimum value if dtype is int or float. Only enabled when _choices_ is None.
        _maxV_: set the maximum value if dtype is int or float. Only enabled when _choices_ is None.
        _discription_: a string to describe this option.
    '''
    self.__capture()

    # check the option name
    declare.is_valid_string("name", name)
    name = name.strip()
    self.__detect_special_char(name)
    assert name[0:2] == "--" and name[2:3] != "-", f"Option name must start with '--' but got: {name}."
    assert name != "--help", "Option name is unavailable: --help."
    if name in self.__arguments.keys():
        raise WrongOperation(f"Option name already exists: {name}.")

    # check dtype
    declare.is_instances("option dtype", dtype, (float, int, bool, str))

    # check the abbreviation
    if abbr is not None:
        declare.is_valid_string("abbr", abbr)
        abbr = abbr.strip()
        self.__detect_special_char(abbr)
        assert abbr[0:1] == "-" and abbr[1:2] != "-", f"Abbreviation must start with '-' but got: {abbr}."
        assert abbr != "-h", "Abbreviation is unavailable: -h."
        if abbr in self.__abb2Name.keys():
            raise WrongOperation(f"Abbreviation already exists: {abbr}.")

    # check the default value
    if default is not None:
        if isinstance(default, (list, tuple)):
            declare.members_are_classes(f"Default value of {name}", default, dtype)
        else:
            declare.is_classes(f"Default value of {name}", default, dtype)
        if dtype == str:
            self.__detect_special_char(default)

    # check choices
    if choices is not None:
        declare.is_classes(f"Choices of {name}", choices, (list, tuple))
        declare.members_are_classes(f"Choices of {name}", choices, dtype)
        if dtype == str:
            self.__detect_special_char(choices)
        if default is not None:
            if isinstance(default, (list, tuple)):
                declare.members_are_instances(f"Default value of {name}", default, choices)
            else:
                declare.is_instances(f"Default value of {name}", default, choices)

    # check the boundary values
    if minV is not None or maxV is not None:
        assert dtype in [float, int], f"Only float and int options can set a boundary but {name} is {dtype.__name__}."
        assert choices is None, f"Cannot set choices and boundary concurrently: {name}."
    if minV is not None:
        declare.is_classes(f"Minimum value of {name}", minV, dtype)
        if default is not None:
            if isinstance(default, (list, tuple)):
                for v in default:
                    declare.greater_equal(f"Default value of {name}", v, "minimum expected value", minV)
            else:
                declare.greater_equal(f"Default value of {name}", default, "minimum expected value", minV)

    if maxV is not None:
        declare.is_classes(f"Maximum value of {name}", maxV, dtype)
        if default is not None:
            if isinstance(default, (list, tuple)):
                for v in default:
                    declare.less_equal(f"Default value of {name}", v, "maximum expected value", maxV)
            else:
                declare.less_equal(f"Default value of {name}", default, "maximum expected value", maxV)

    if minV is not None and maxV is not None:
        declare.less_equal(f"Minimum value of {name}", minV, "maximum value", maxV)

    # check discription
    if discription is not None:
        declare.is_valid_string(f"Discription of {name}", discription)
        self.__detect_special_char(discription)

    self.__arguments[name] = self.spec(dtype, default, choices, minV, maxV, discription)
    self.__name2Abb[name] = abbr
    if abbr is not None:
        self.__abb2Name[abbr] = name
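# A hedged example of declaring options with add(); `args` stands for an
# ArgumentParser instance (its construction is outside this excerpt) and the
# option names are made up for illustration.
#
#   args.add("--epoch", dtype=int, abbr="-e", default=10,
#            minV=1, discription="number of training epochs")
#   args.add("--optimizer", dtype=str, default="adam",
#            choices=["adam", "sgd"], discription="optimizer name")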
def load(self, filePath):
    '''
    Load arguments from a file.

    Args:
        _filePath_: args file path.
    '''
    declare.is_file("filePath", filePath)
    self.reset()

    with open(filePath, "r", encoding="utf-8") as fr:
        lines = fr.read()
    lines = lines.strip()
    if len(lines) == 0:
        raise WrongOperation(f"This is a void file: {filePath}.")

    blocks = lines.split("\n\n")

    def __parse(name, value, dtype):
        if dtype in [float, int]:
            try:
                value = dtype(value)
            except ValueError:
                raise WrongOperation(f"Option <{name}> needs a {dtype.__name__} value but got: {value}.")
        elif dtype == bool:
            if value.lower() == "true":
                value = True
            elif value.lower() == "false":
                value = False
            else:
                raise WrongOperation(f"Option <{name}> needs a bool value but got: {value}.")
        return value

    self.__discription = blocks[0].strip()

    for blockNo, block in enumerate(blocks[1:], start=1):
        block = block.strip()
        if len(block) == 0:
            continue
        block = block.split("\n")

        # 1. match the option fields
        values = {"name": None, "abbr": None, "dtype": None, "default": None, "choices": None,
                  "minV": None, "maxV": None, "discription": None, "value": None}
        for m in block:
            m = m.strip()
            assert "=" in m, f"Argument should have the format: key = value, but got: {m}."
            assert len(m.split("=")) == 2, f"Argument should have the format: key = value, but got: {m}."
            m = m.split("=")
            name = m[0].strip()
            value = m[1].strip()
            declare.is_instances("Option key", name, list(values.keys()))
            values[name] = value

        for key, value in values.items():
            assert value is not None, f"Missed {key} information in block: {blockNo}."

        # 2. parse the fields
        name = values["name"]
        # parse the dtype firstly
        declare.is_instances("dtype", values["dtype"], ["float", "int", "bool", "str"])
        values["dtype"] = eval(values["dtype"])
        dtype = values["dtype"]

        # then parse the choices
        choices = values["choices"]
        if choices in ["none", "None"]:
            choices = None
        else:
            choices = choices.split("|")
            for i, c in enumerate(choices):
                choices[i] = __parse(name, c, dtype)
        values["choices"] = choices

        # then parse the boundary values
        boundary = {"minV": None, "maxV": None}
        for i in boundary.keys():
            V = values[i]
            if V not in ["none", "None"]:
                assert dtype in [float, int], f"Only float and int options can set a boundary but {name} is {dtype.__name__}."
                assert choices is None, f"{name} cannot set choices and boundary concurrently."
                toIntFlag = True
                toFloatFlag = True
                try:
                    float(V)
                except ValueError:
                    toFloatFlag = False
                try:
                    int(V)
                except ValueError:
                    toIntFlag = False

                if toIntFlag is False and toFloatFlag is False:
                    raise WrongDataFormat(f"Boundary values of {name} should be an int or float value but got: {V}.")
                elif toIntFlag is False and toFloatFlag is True:
                    # the boundary value is predicted to be a float value
                    if dtype != float:
                        raise WrongDataFormat(f"{name}'s dtype is int but tried to set a boundary with a float value: {V}.")
                    else:
                        V = float(V)
                elif toIntFlag is True and toFloatFlag is True:
                    # the boundary value could be either a float or an int value
                    V = dtype(V)
                else:
                    raise WrongDataFormat(f"Failed to set {name}'s boundary value: {V}.")
                boundary[i] = V

        values["minV"] = boundary["minV"]
        values["maxV"] = boundary["maxV"]

        # then parse the default and the value
        if values["default"].lower() == "none":
            values["default"] = None
        else:
            default = values["default"].split("|")
            for i, v in enumerate(default):
                default[i] = __parse(name, v, dtype)
            values["default"] = default if len(default) > 1 else default[0]

        # the validity of "default" will be checked by the .add() call below, so here we only verify "value"
        if values["value"].lower() == "none":
            values["value"] = None
        else:
            value = values["value"].split("|")
            for i, v in enumerate(value):
                v = __parse(name, v, dtype)
                if values["choices"] is not None:
                    declare.is_instances("Option value", v, values["choices"])
                else:
                    if values["minV"] is not None:
                        declare.greater_equal("Option value", v, "minimum expected value", values["minV"])
                    if values["maxV"] is not None:
                        declare.less_equal("Option value", v, "maximum expected value", values["maxV"])
                value[i] = v
            if len(value) == 1:
                value = value[0]
            values["value"] = value

        # check the abbreviation
        if values["abbr"] in ["none", "None"]:
            values["abbr"] = None

        # add this option
        self.add(name=values["name"],
                 dtype=values["dtype"],
                 abbr=values["abbr"],
                 default=values["default"],
                 choices=values["choices"],
                 minV=values["minV"],
                 maxV=values["maxV"],
                 discription=values["discription"]
                )

        # finally, modify the "value"
        self.__arguments[values["name"]] = self.__arguments[values["name"]]._replace(value=values["value"])
        if values["value"] is not None:
            self.__setattr__(values["name"][2:], values["value"])
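# Sketch of an arguments file that load() can parse, reconstructed from the
# parsing logic above (first block = discription, then one "key = value" block
# per option, blank-line separated, with "|" joining multiple values and
# "none" marking empty fields). The concrete option shown is illustrative.
#
#   a demo script
#
#   name = --epoch
#   abbr = -e
#   dtype = int
#   default = 10
#   choices = none
#   minV = 1
#   maxV = none
#   discription = number of training epochs
#   value = 20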
def parse(self):
    '''
    Start to parse the arguments.
    '''
    self.__capture()

    # extract the arguments
    temp = self.__argv.copy()
    temp.reverse()
    newArgv = []
    for a in temp:
        if a.endswith(".py"):
            break
        a = a.split("=")
        a.reverse()
        newArgv.extend(a)

    # match these arguments
    result = dict((key, proto.default) for key, proto in self.__arguments.items())

    for i, op in enumerate(newArgv):
        if op[0:1] == "-" and op[1:2] != "-":
            if op == "-h":
                self.print_help_and_exit()
            if op not in self.__abb2Name.keys():
                raise WrongOperation(f"Option has not been defined: {op}.")
            else:
                op = self.__abb2Name[op]

        if op.startswith("--"):
            if op == "--help":
                self.print_help_and_exit()
            if op not in self.__arguments.keys():
                raise WrongOperation(f"Option has not been defined: {op}.")
            if i % 2 == 0:
                raise WrongOperation(f"Missed value for option: {op}.")

            # an option value might have a format such as: 1|2
            vs = newArgv[i-1].split("|")
            proto = self.__arguments[op]

            if proto.dtype in [float, int]:
                try:
                    for j, v in enumerate(vs):
                        vs[j] = proto.dtype(v)
                except ValueError:
                    raise WrongOperation(f"Option <{op}> needs a {proto.dtype.__name__} value but got: {v}.")
            elif proto.dtype == bool:
                for j, v in enumerate(vs):
                    if v.lower() == "true":
                        v = True
                    elif v.lower() == "false":
                        v = False
                    else:
                        raise WrongOperation(f"Option <{op}> needs a bool value but got: {v}.")
                    vs[j] = v
            # vs is now a list of parsed values

            if proto.choices is not None:
                declare.members_are_instances(f"Option value of {op}", vs, proto.choices)
            else:
                if proto.minV is not None:
                    for v in vs:
                        declare.greater_equal(f"Option value of {op}", v, "minimum expected value", proto.minV)
                if proto.maxV is not None:
                    for v in vs:
                        declare.less_equal(f"Option value of {op}", v, "maximum expected value", proto.maxV)

            result[op] = vs if len(vs) > 1 else vs[0]

    # set the attributes
    for name, value in result.items():
        if value is None:
            raise WrongOperation(f"Missed value for option: {name}.")
        else:
            self.__arguments[name] = self.__arguments[name]._replace(value=value)
            self.__setattr__(name[2:], value)
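# Command-line sketch of how parse() consumes the captured argv: a value may
# follow its option either as a separate token or joined with "=", and "|"
# packs multiple values into a list (quote it so the shell does not treat it as
# a pipe). The script and option names are illustrative.
#
#   python train.py --epoch 20 --optimizer=adam --lr "0.01|0.001"
#
# After parse(), the values are reachable as attributes with the leading "--"
# stripped, e.g. args.epoch == 20 and args.lr == [0.01, 0.001].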